From dd9b55d3d923a40aefd5441a2345f309cf2e156d Mon Sep 17 00:00:00 2001
From: dim
Date: Fri, 6 Jan 2017 20:13:21 +0000
Subject: [PATCH] Vendor import of llvm trunk r291274:

https://llvm.org/svn/llvm-project/llvm/trunk@291274

---
 cmake/config-ix.cmake | 7 +-
 cmake/modules/AddLLVM.cmake | 4 +
 docs/CompileCudaWithLLVM.rst | 4 +-
 docs/Phabricator.rst | 2 +-
 include/llvm/Analysis/CGSCCPassManager.h | 2 +-
 include/llvm/Analysis/TargetTransformInfo.h | 19 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h | 30 +-
 include/llvm/Bitcode/BitCodes.h | 6 +-
 include/llvm/Bitcode/BitstreamReader.h | 11 +-
 include/llvm/Bitcode/BitstreamWriter.h | 26 +-
 include/llvm/CodeGen/AsmPrinter.h | 7 +
 include/llvm/CodeGen/BasicTTIImpl.h | 5 +-
 include/llvm/CodeGen/DIE.h | 9 +-
 .../llvm/CodeGen/GlobalISel/IRTranslator.h | 6 +-
 include/llvm/CodeGen/MachineBasicBlock.h | 2 +-
 include/llvm/DebugInfo/DWARF/DWARFDie.h | 58 +-
 .../llvm/ExecutionEngine/Orc/RawByteChannel.h | 2 +-
 include/llvm/IR/ModuleSummaryIndex.h | 112 +-
 include/llvm/IR/ModuleSummaryIndexYAML.h | 111 +
 include/llvm/IR/PassManager.h | 250 +-
 include/llvm/LTO/LTO.h | 7 +
 include/llvm/MC/MCTargetOptions.h | 10 +-
 include/llvm/Support/FileSystem.h | 19 +-
 include/llvm/Support/TarWriter.h | 32 +
 include/llvm/Transforms/IPO/FunctionImport.h | 13 +-
 include/llvm/Transforms/IPO/LowerTypeTests.h | 4 -
 .../Transforms/Utils/FunctionImportUtils.h | 18 +
 include/llvm/module.modulemap | 1 +
 lib/Analysis/ModuleSummaryAnalysis.cpp | 120 +-
 lib/Analysis/TargetTransformInfo.cpp | 5 +-
 lib/Bitcode/Reader/BitcodeReader.cpp | 16 +-
 lib/Bitcode/Reader/BitstreamReader.cpp | 22 +-
 lib/Bitcode/Reader/MetadataLoader.cpp | 400 +++-
 lib/Bitcode/Writer/BitcodeWriter.cpp | 191 +-
 lib/CodeGen/AsmPrinter/ARMException.cpp | 3 +-
 lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 24 +-
 .../AsmPrinter/AsmPrinterInlineAsm.cpp | 2 +
 lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 2 +-
 lib/CodeGen/GlobalISel/IRTranslator.cpp | 44 +-
 lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 13 +-
 lib/CodeGen/IfConversion.cpp | 42 +-
 lib/CodeGen/MIRPrinter.cpp | 8 +-
 lib/CodeGen/MachineBasicBlock.cpp | 9 +-
 lib/CodeGen/MachineVerifier.cpp | 18 +-
 lib/CodeGen/RegisterScavenging.cpp | 5 -
 .../SelectionDAG/SelectionDAGBuilder.cpp | 17 +-
 lib/DebugInfo/DWARF/DWARFDie.cpp | 5 +-
 lib/Fuzzer/FuzzerDriver.cpp | 1 +
 lib/Fuzzer/FuzzerFlags.def | 1 +
 lib/Fuzzer/FuzzerIO.h | 3 +
 lib/Fuzzer/FuzzerIOPosix.cpp | 6 +
 lib/Fuzzer/FuzzerIOWindows.cpp | 2 +
 lib/Fuzzer/FuzzerInternal.h | 1 +
 lib/Fuzzer/FuzzerLoop.cpp | 5 +
 lib/Fuzzer/FuzzerMerge.cpp | 12 +-
 lib/Fuzzer/FuzzerOptions.h | 1 +
 lib/Fuzzer/FuzzerTraceState.cpp | 47 -
 lib/Fuzzer/FuzzerUtilPosix.cpp | 6 +
 lib/Fuzzer/FuzzerUtilWindows.cpp | 1 +
 lib/Fuzzer/test/merge.test | 8 +
 lib/LTO/LTO.cpp | 41 +-
 lib/LTO/ThinLTOCodeGenerator.cpp | 42 +-
 lib/Support/APInt.cpp | 2 +-
 lib/Support/CMakeLists.txt | 1 +
 lib/Support/Host.cpp | 20 +-
 lib/Support/TarWriter.cpp | 166 ++
 lib/Support/Unix/Signals.inc | 2 +-
 lib/Target/AArch64/AArch64CollectLOH.cpp | 1123 +++------
 lib/Target/AArch64/AArch64ISelLowering.cpp | 110 +-
 lib/Target/AArch64/AArch64InstrInfo.cpp | 190 +-
 lib/Target/AArch64/AArch64InstrInfo.h | 4 +
 .../AArch64/AArch64InstructionSelector.cpp | 10 +-
 .../AArch64/AArch64InstructionSelector.h | 8 +-
 .../AArch64/AArch64MachineFunctionInfo.h | 48 +-
 lib/Target/AArch64/AArch64Subtarget.cpp | 1 +
 lib/Target/AArch64/AArch64TargetMachine.cpp | 38 +-
 .../AArch64/AArch64TargetTransformInfo.cpp | 7 +-
 .../AArch64/AArch64TargetTransformInfo.h | 2 +-
 .../AArch64/AsmParser/AArch64AsmParser.cpp | 89 +-
 .../Disassembler/AArch64Disassembler.h | 9 +-
 .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 13 +-
 .../MCTargetDesc/AArch64MCCodeEmitter.cpp | 27 +-
 .../MCTargetDesc/AArch64TargetStreamer.cpp | 3 +-
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 10 +-
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp | 7 +-
 lib/Target/ARM/ARMTargetTransformInfo.h | 3 +-
 lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 48 +-
 .../Lanai/Disassembler/LanaiDisassembler.h | 7 +-
 .../Lanai/InstPrinter/LanaiInstPrinter.h | 13 +-
 lib/Target/Lanai/LanaiISelLowering.cpp | 42 +-
 lib/Target/Lanai/LanaiRegisterInfo.h | 9 +-
 .../MCTargetDesc/LanaiELFObjectWriter.cpp | 12 +-
 .../Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp | 29 +-
 .../Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp | 15 +-
 lib/Target/PowerPC/PPCISelLowering.h | 2 +-
 lib/Target/PowerPC/PPCInstr64Bit.td | 3 +
 lib/Target/PowerPC/PPCInstrFormats.td | 6 +
 lib/Target/PowerPC/PPCInstrInfo.td | 8 +
 lib/Target/X86/X86ISelLowering.cpp | 172 +-
 lib/Target/X86/X86TargetTransformInfo.cpp | 630 +++---
 lib/Target/X86/X86TargetTransformInfo.h | 3 +-
 lib/Transforms/IPO/FunctionImport.cpp | 182 +-
 lib/Transforms/IPO/LowerTypeTests.cpp | 316 ++-
 .../InstCombine/InstCombineCalls.cpp | 14 +
 .../Instrumentation/AddressSanitizer.cpp | 96 +-
 lib/Transforms/Scalar/GVN.cpp | 14 +-
 lib/Transforms/Scalar/LICM.cpp | 37 +-
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 2 +-
 lib/Transforms/Scalar/LoopSink.cpp | 3 +-
 lib/Transforms/Utils/FunctionImportUtils.cpp | 24 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp | 59 +-
 test/Analysis/CostModel/AArch64/bswap.ll | 70 +
 test/Analysis/CostModel/AArch64/falkor.ll | 26 +
 test/Analysis/CostModel/AArch64/gep.ll | 66 +-
 test/Analysis/CostModel/X86/arith.ll | 4 +-
 .../CostModel/X86/shuffle-broadcast.ll | 140 +-
 test/Analysis/CostModel/X86/vdiv-cost.ll | 66 +-
 .../CostModel/X86/vshift-ashr-cost.ll | 256 ++-
 .../CostModel/X86/vshift-lshr-cost.ll | 259 ++-
 .../Analysis/CostModel/X86/vshift-shl-cost.ll | 261 ++-
 test/Bitcode/summary_version.ll | 2 +-
 test/Bitcode/thinlto-function-summary.ll | 2 +-
 test/Bitcode/thinlto-summary-section.ll | 8 +-
 .../GlobalISel/arm64-instructionselect.mir | 6 +-
 .../AArch64/GlobalISel/arm64-irtranslator.ll | 88 +-
 .../GlobalISel/irtranslator-exceptions.ll | 4 +-
 .../arm64-collect-loh-garbage-crash.ll | 2 +-
 test/CodeGen/AArch64/arm64-collect-loh-str.ll | 2 +-
 test/CodeGen/AArch64/arm64-collect-loh.ll | 17 +-
 test/CodeGen/AArch64/loh.mir | 193 ++
 test/CodeGen/AArch64/machine-scheduler.mir | 5 +-
 test/CodeGen/AMDGPU/hsa-func.ll | 3 +-
 test/CodeGen/AMDGPU/hsa.ll | 4 +-
 test/CodeGen/Generic/cfi-sections.ll | 39 +
 test/CodeGen/MIR/AArch64/spill-fold.mir | 82 +
 test/CodeGen/MIR/X86/basic-block-liveins.mir | 9 +-
 test/CodeGen/MIR/X86/machine-verifier.mir | 3 +-
 test/CodeGen/NVPTX/tid-range.ll | 18 +
 .../X86/GlobalISel/irtranslator-call.ll | 2 +-
 test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 7 +-
 test/CodeGen/X86/avx512-trunc.ll | 107 +
 test/CodeGen/X86/cmov.ll | 18 +
 test/CodeGen/X86/lower-vec-shift-2.ll | 18 +-
 test/CodeGen/X86/shuffle-vs-trunc-128.ll | 481 ++++
 test/CodeGen/X86/shuffle-vs-trunc-256.ll | 629 ++++++
 test/CodeGen/X86/shuffle-vs-trunc-512.ll | 537 +++++
 test/CodeGen/X86/tail-call-conditional.mir | 3 +-
 test/CodeGen/X86/vector-rotate-128.ll | 2 -
 test/CodeGen/X86/vector-shift-ashr-128.ll | 100 +-
 test/CodeGen/X86/vector-shift-ashr-256.ll | 74 +-
 test/CodeGen/X86/vector-shift-ashr-512.ll | 321 ++-
 test/CodeGen/X86/vector-shift-lshr-128.ll | 74 +-
 test/CodeGen/X86/vector-shift-lshr-256.ll | 74 +-
 test/CodeGen/X86/vector-shift-lshr-512.ll | 317 ++-
 test/CodeGen/X86/vector-shift-shl-128.ll | 74 +-
 test/CodeGen/X86/vector-shift-shl-256.ll | 74 +-
 test/CodeGen/X86/vector-shift-shl-512.ll | 317 ++-
 test/CodeGen/X86/vector-shuffle-128-v16.ll | 89 +-
 test/CodeGen/X86/vector-shuffle-128-v4.ll | 83 +-
 test/CodeGen/X86/vector-shuffle-128-v8.ll | 112 +-
 test/CodeGen/X86/vector-shuffle-masked.ll | 450 ++++
 test/CodeGen/X86/vector-tzcnt-128.ll | 425 +---
 test/CodeGen/X86/vshift-4.ll | 2 -
 .../DebugInfo/Generic/licm-hoist-debug-loc.ll | 75 +
 .../asan-masked-load-store.ll | 76 +-
 test/MC/AsmParser/Inputs/function.x | 3 +
 test/MC/AsmParser/Inputs/module.x | 3 +
 test/MC/AsmParser/include.ll | 13 +
 .../PowerPC/ppc64-encoding-fp.txt | 18 +
 test/MC/PowerPC/ppc64-encoding-fp.s | 32 +-
 test/ThinLTO/X86/Inputs/deadstrip.ll | 22 +
 test/ThinLTO/X86/Inputs/lazyload_metadata.ll | 12 +
 test/ThinLTO/X86/deadstrip.ll | 109 +
 test/ThinLTO/X86/lazyload_metadata.ll | 54 +
 test/Transforms/GVN/PRE/phi-translate.ll | 13 +-
 .../InstCombine/amdgcn-intrinsics.ll | 34 +
 test/Transforms/InstCombine/cos-intrinsic.ll | 55 +
 test/Transforms/InstCombine/icmp-shl-nsw.ll | 218 ++
 test/Transforms/InstCombine/icmp.ll | 63 -
 test/Transforms/InstSimplify/select.ll | 28 +
 test/Transforms/LICM/scalar_promote.ll | 192 ++
 .../LoopVectorize/X86/strided_load_cost.ll | 54 +
 .../LowerTypeTests/Inputs/import-unsat.yaml | 10 +
 .../LowerTypeTests/export-nothing.ll | 7 +
 .../LowerTypeTests/function-disjoint.ll | 6 +-
 .../Transforms/LowerTypeTests/function-ext.ll | 3 +-
 test/Transforms/LowerTypeTests/function.ll | 2 +-
 .../Transforms/LowerTypeTests/import-unsat.ll | 23 +
 test/Transforms/LowerTypeTests/simple.ll | 2 +-
 .../LowerTypeTests/single-offset.ll | 2 +-
 test/Transforms/LowerTypeTests/unsat.ll | 3 +-
 tools/dsymutil/DwarfLinker.cpp | 12 +-
 tools/llc/llc.cpp | 3 +
 tools/llvm-config/llvm-config.cpp | 10 +-
 unittests/ADT/APFloatTest.cpp | 275 ++-
 unittests/ADT/IntrusiveRefCntPtrTest.cpp | 4 +-
 unittests/Bitcode/BitstreamReaderTest.cpp | 4 +-
 .../DebugInfo/DWARF/DWARFDebugInfoTest.cpp | 121 +
 unittests/DebugInfo/DWARF/DwarfGenerator.cpp | 4 +
 unittests/DebugInfo/DWARF/DwarfGenerator.h | 3 +
 utils/lit/lit/formats/googletest.py | 8 +-
 utils/unittest/CMakeLists.txt | 6 +
 utils/unittest/googletest/README.LLVM | 16 +-
 .../include/gtest/gtest-death-test.h | 17 +-
 .../googletest/include/gtest/gtest-message.h | 112 +-
 .../include/gtest/gtest-param-test.h | 56 +-
 .../googletest/include/gtest/gtest-printers.h | 335 ++-
 .../googletest/include/gtest/gtest-spi.h | 7 +-
 .../include/gtest/gtest-test-part.h | 23 +-
 .../include/gtest/gtest-typed-test.h | 11 +-
 .../unittest/googletest/include/gtest/gtest.h | 565 +++--
 .../include/gtest/gtest_pred_impl.h | 12 +-
 .../gtest/internal/custom/gtest-port.h | 69 +
 .../gtest/internal/custom/gtest-printers.h | 42 +
 .../include/gtest/internal/custom/gtest.h | 41 +
 .../internal/gtest-death-test-internal.h | 29 +-
 .../include/gtest/internal/gtest-filepath.h | 16 +-
 .../include/gtest/internal/gtest-internal.h | 403 ++--
 .../include/gtest/internal/gtest-linked_ptr.h | 22 +-
 .../internal/gtest-param-util-generated.h | 679 ++++--
 .../include/gtest/internal/gtest-param-util.h | 190 +-
 .../include/gtest/internal/gtest-port-arch.h | 97 +
 .../include/gtest/internal/gtest-port.h | 1219 ++++++++--
 .../include/gtest/internal/gtest-string.h | 217 +-
 .../include/gtest/internal/gtest-tuple.h | 100 +-
 .../include/gtest/internal/gtest-type-util.h | 21 +-
 .../googletest/src/gtest-death-test.cc | 344 ++-
 .../unittest/googletest/src/gtest-filepath.cc | 43 +-
 .../googletest/src/gtest-internal-inl.h | 332 ++-
 utils/unittest/googletest/src/gtest-port.cc | 699 +++++-
 .../unittest/googletest/src/gtest-printers.cc | 115 +-
 .../googletest/src/gtest-test-part.cc | 12 +-
 .../googletest/src/gtest-typed-test.cc | 42 +-
 utils/unittest/googletest/src/gtest.cc | 2012 +++++++++++------
 235 files changed, 14791 insertions(+), 6255 deletions(-)
 create mode 100644 include/llvm/IR/ModuleSummaryIndexYAML.h
 create mode 100644 include/llvm/Support/TarWriter.h
 create mode 100644 lib/Support/TarWriter.cpp
 create mode 100644 test/Analysis/CostModel/AArch64/bswap.ll
 create mode 100644 test/Analysis/CostModel/AArch64/falkor.ll
 create mode 100644 test/CodeGen/AArch64/loh.mir
 create mode 100644 test/CodeGen/Generic/cfi-sections.ll
 create mode 100644 test/CodeGen/MIR/AArch64/spill-fold.mir
 create mode 100644 test/CodeGen/NVPTX/tid-range.ll
 create mode 100644 test/CodeGen/X86/shuffle-vs-trunc-128.ll
 create mode 100644 test/CodeGen/X86/shuffle-vs-trunc-256.ll
 create mode 100644 test/CodeGen/X86/shuffle-vs-trunc-512.ll
 create mode 100644 test/DebugInfo/Generic/licm-hoist-debug-loc.ll
 create mode 100644 test/MC/AsmParser/Inputs/function.x
 create mode 100644 test/MC/AsmParser/Inputs/module.x
 create mode 100644 test/MC/AsmParser/include.ll
 create mode 100644 test/ThinLTO/X86/Inputs/deadstrip.ll
 create mode 100644 test/ThinLTO/X86/Inputs/lazyload_metadata.ll
 create mode 100644 test/ThinLTO/X86/deadstrip.ll
 create mode 100644 test/ThinLTO/X86/lazyload_metadata.ll
 create mode 100644 test/Transforms/InstCombine/icmp-shl-nsw.ll
 create mode 100644 test/Transforms/LoopVectorize/X86/strided_load_cost.ll
 create mode 100644 test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
 create mode 100644 test/Transforms/LowerTypeTests/export-nothing.ll
 create mode 100644 test/Transforms/LowerTypeTests/import-unsat.ll
 create mode 100644 utils/unittest/googletest/include/gtest/internal/custom/gtest-port.h
 create mode 100644 utils/unittest/googletest/include/gtest/internal/custom/gtest-printers.h
 create mode 100644 utils/unittest/googletest/include/gtest/internal/custom/gtest.h
 create mode 100644 utils/unittest/googletest/include/gtest/internal/gtest-port-arch.h

diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 530a5ddaab4..d76f1293d02 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -167,7 +167,10 @@ check_symbol_exists(futimens sys/stat.h HAVE_FUTIMENS)
 check_symbol_exists(futimes sys/time.h HAVE_FUTIMES)
 check_symbol_exists(posix_fallocate fcntl.h HAVE_POSIX_FALLOCATE)
 # AddressSanitizer conflicts with lib/Support/Unix/Signals.inc
-if( HAVE_SIGNAL_H AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*")
+# Avoid sigaltstack on Apple platforms, where backtrace() cannot handle it
+# (rdar://7089625) and _Unwind_Backtrace is unusable because it cannot unwind
+# past the signal handler after an assertion failure (rdar://29866587).
+if( HAVE_SIGNAL_H AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*" AND NOT APPLE ) check_symbol_exists(sigaltstack signal.h HAVE_SIGALTSTACK) endif() if( HAVE_SYS_UIO_H ) @@ -314,6 +317,8 @@ else() endif() check_cxx_compiler_flag("-Wno-variadic-macros" SUPPORTS_NO_VARIADIC_MACROS_FLAG) +check_cxx_compiler_flag("-Wno-gnu-zero-variadic-macro-arguments" + SUPPORTS_NO_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG) set(USE_NO_MAYBE_UNINITIALIZED 0) set(USE_NO_UNINITIALIZED 0) diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index f35fcf444d2..fbef1d04eac 100755 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -1014,6 +1014,10 @@ function(add_unittest test_suite test_name) if (SUPPORTS_NO_VARIADIC_MACROS_FLAG) list(APPEND LLVM_COMPILE_FLAGS "-Wno-variadic-macros") endif () + # Some parts of gtest rely on this GNU extension, don't warn on it. + if(SUPPORTS_NO_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG) + list(APPEND LLVM_COMPILE_FLAGS "-Wno-gnu-zero-variadic-macro-arguments") + endif() set(LLVM_REQUIRES_RTTI OFF) diff --git a/docs/CompileCudaWithLLVM.rst b/docs/CompileCudaWithLLVM.rst index af681aeead6..6ad8652cfc1 100644 --- a/docs/CompileCudaWithLLVM.rst +++ b/docs/CompileCudaWithLLVM.rst @@ -35,8 +35,8 @@ by many Linux package managers; you probably need to install nvidia's package. You will need CUDA 7.0, 7.5, or 8.0 to compile with clang. -CUDA compilation is supported on Linux, and on MacOS as of XXXX-XX-XX. Windows -support is planned but not yet in place. +CUDA compilation is supported on Linux, on MacOS as of 2016-11-18, and on +Windows as of 2017-01-05. Invoking clang -------------- diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst index 06a9c6af9b4..8d1984b65cd 100644 --- a/docs/Phabricator.rst +++ b/docs/Phabricator.rst @@ -132,7 +132,7 @@ committed to trunk. If you do not have commit access, someone has to commit the change for you (with attribution). It is sufficient to add a comment to the approved review indicating you cannot commit the patch yourself. If you have commit access, there are multiple workflows to commit the -change. Whichever method you follow it is recommend that your commit message +change. Whichever method you follow it is recommended that your commit message ends with the line: :: diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h index 54ef1a688d3..6fbe532112b 100644 --- a/include/llvm/Analysis/CGSCCPassManager.h +++ b/include/llvm/Analysis/CGSCCPassManager.h @@ -128,7 +128,7 @@ extern template class PassManager diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index d583614284f..b4a6c5c2fae 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -36,6 +36,8 @@ namespace llvm { class Function; class GlobalValue; class Loop; +class ScalarEvolution; +class SCEV; class Type; class User; class Value; @@ -613,10 +615,11 @@ class TargetTransformInfo { /// merged into the instruction indexing mode. Some targets might want to /// distinguish between address computation for memory operations on vector /// types and scalar types. Such targets should override this function. - /// The 'IsComplex' parameter is a hint that the address computation is likely - /// to involve multiple instructions and as such unlikely to be merged into - /// the address indexing mode. 
- int getAddressComputationCost(Type *Ty, bool IsComplex = false) const; + /// The 'SE' parameter holds pointer for the scalar evolution object which + /// is used in order to get the Ptr step value in case of constant stride. + /// The 'Ptr' parameter holds SCEV of the access pointer. + int getAddressComputationCost(Type *Ty, ScalarEvolution *SE = nullptr, + const SCEV *Ptr = nullptr) const; /// \returns The cost, if any, of keeping values of the given types alive /// over a callsite. @@ -795,7 +798,8 @@ class TargetTransformInfo::Concept { virtual int getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) = 0; virtual unsigned getNumberOfParts(Type *Tp) = 0; - virtual int getAddressComputationCost(Type *Ty, bool IsComplex) = 0; + virtual int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) = 0; virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) = 0; virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) = 0; @@ -1044,8 +1048,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getNumberOfParts(Type *Tp) override { return Impl.getNumberOfParts(Tp); } - int getAddressComputationCost(Type *Ty, bool IsComplex) override { - return Impl.getAddressComputationCost(Ty, IsComplex); + int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) override { + return Impl.getAddressComputationCost(Ty, SE, Ptr); } unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) override { return Impl.getCostOfKeepingLiveOverCall(Tys); diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 68b38a7fa53..1d7edbaf7df 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -15,6 +15,7 @@ #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H #define LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" @@ -370,7 +371,10 @@ class TargetTransformInfoImplBase { unsigned getNumberOfParts(Type *Tp) { return 0; } - unsigned getAddressComputationCost(Type *Tp, bool) { return 0; } + unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *, + const SCEV *) { + return 0; + } unsigned getReductionCost(unsigned, Type *, bool) { return 1; } @@ -422,6 +426,30 @@ class TargetTransformInfoImplBase { VectorType *VecTy) const { return VF; } +protected: + bool isStridedAccess(const SCEV *Ptr) { + return Ptr && isa(Ptr); + } + + const SCEVConstant *getConstantStrideStep(ScalarEvolution *SE, + const SCEV *Ptr) { + if (!isStridedAccess(Ptr)) + return nullptr; + const SCEVAddRecExpr *AddRec = cast(Ptr); + return dyn_cast(AddRec->getStepRecurrence(*SE)); + } + + bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, + int64_t MergeDistance) { + const SCEVConstant *Step = getConstantStrideStep(SE, Ptr); + if (!Step) + return false; + APInt StrideVal = Step->getAPInt(); + if (StrideVal.getBitWidth() > 64) + return false; + // FIXME: need to take absolute value for negtive stride case + return StrideVal.getSExtValue() < MergeDistance; + } }; /// \brief CRTP base class for use as a mix-in that aids implementing diff --git a/include/llvm/Bitcode/BitCodes.h b/include/llvm/Bitcode/BitCodes.h index cfc7a1d7d6b..bf21e146e77 100644 --- a/include/llvm/Bitcode/BitCodes.h +++ b/include/llvm/Bitcode/BitCodes.h @@ -18,7 +18,6 @@ #ifndef 
LLVM_BITCODE_BITCODES_H #define LLVM_BITCODE_BITCODES_H -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" @@ -166,11 +165,8 @@ template <> struct isPodLike { static const bool value=true; }; /// BitCodeAbbrev - This class represents an abbreviation record. An /// abbreviation allows a complex record that has redundancy to be stored in a /// specialized format instead of the fully-general, fully-vbr, format. -class BitCodeAbbrev : public RefCountedBase { +class BitCodeAbbrev { SmallVector OperandList; - // Only RefCountedBase is allowed to delete. - ~BitCodeAbbrev() = default; - friend class RefCountedBase; public: unsigned getNumOperandInfos() const { diff --git a/include/llvm/Bitcode/BitstreamReader.h b/include/llvm/Bitcode/BitstreamReader.h index 4d95a6ce8a1..fc06eeefbf2 100644 --- a/include/llvm/Bitcode/BitstreamReader.h +++ b/include/llvm/Bitcode/BitstreamReader.h @@ -16,7 +16,6 @@ #define LLVM_BITCODE_BITSTREAMREADER_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Bitcode/BitCodes.h" #include "llvm/Support/Endian.h" @@ -42,7 +41,7 @@ class BitstreamBlockInfo { /// describe abbreviations that all blocks of the specified ID inherit. struct BlockInfo { unsigned BlockID; - std::vector> Abbrevs; + std::vector> Abbrevs; std::string Name; std::vector > RecordNames; }; @@ -316,11 +315,11 @@ class BitstreamCursor : SimpleBitstreamCursor { unsigned CurCodeSize = 2; /// Abbrevs installed at in this block. - std::vector> CurAbbrevs; + std::vector> CurAbbrevs; struct Block { unsigned PrevCodeSize; - std::vector> PrevAbbrevs; + std::vector> PrevAbbrevs; explicit Block(unsigned PCS) : PrevCodeSize(PCS) {} }; @@ -478,8 +477,8 @@ class BitstreamCursor : SimpleBitstreamCursor { return CurAbbrevs[AbbrevNo].get(); } - /// Read the current record and discard it. - void skipRecord(unsigned AbbrevID); + /// Read the current record and discard it, returning the code for the record. + unsigned skipRecord(unsigned AbbrevID); unsigned readRecord(unsigned AbbrevID, SmallVectorImpl &Vals, StringRef *Blob = nullptr); diff --git a/include/llvm/Bitcode/BitstreamWriter.h b/include/llvm/Bitcode/BitstreamWriter.h index 8eb6e8aef7a..e276db5f92f 100644 --- a/include/llvm/Bitcode/BitstreamWriter.h +++ b/include/llvm/Bitcode/BitstreamWriter.h @@ -43,12 +43,12 @@ class BitstreamWriter { unsigned BlockInfoCurBID; /// CurAbbrevs - Abbrevs installed at in this block. - std::vector> CurAbbrevs; + std::vector> CurAbbrevs; struct Block { unsigned PrevCodeSize; size_t StartSizeWord; - std::vector> PrevAbbrevs; + std::vector> PrevAbbrevs; Block(unsigned PCS, size_t SSW) : PrevCodeSize(PCS), StartSizeWord(SSW) {} }; @@ -59,7 +59,7 @@ class BitstreamWriter { /// These describe abbreviations that all blocks of the specified ID inherit. struct BlockInfo { unsigned BlockID; - std::vector> Abbrevs; + std::vector> Abbrevs; }; std::vector BlockInfoRecords; @@ -469,12 +469,12 @@ class BitstreamWriter { private: // Emit the abbreviation as a DEFINE_ABBREV record. 
- void EncodeAbbrev(BitCodeAbbrev *Abbv) { + void EncodeAbbrev(const BitCodeAbbrev &Abbv) { EmitCode(bitc::DEFINE_ABBREV); - EmitVBR(Abbv->getNumOperandInfos(), 5); - for (unsigned i = 0, e = static_cast(Abbv->getNumOperandInfos()); + EmitVBR(Abbv.getNumOperandInfos(), 5); + for (unsigned i = 0, e = static_cast(Abbv.getNumOperandInfos()); i != e; ++i) { - const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i); + const BitCodeAbbrevOp &Op = Abbv.getOperandInfo(i); Emit(Op.isLiteral(), 1); if (Op.isLiteral()) { EmitVBR64(Op.getLiteralValue(), 8); @@ -489,10 +489,10 @@ class BitstreamWriter { /// EmitAbbrev - This emits an abbreviation to the stream. Note that this /// method takes ownership of the specified abbrev. - unsigned EmitAbbrev(BitCodeAbbrev *Abbv) { + unsigned EmitAbbrev(std::shared_ptr Abbv) { // Emit the abbreviation as a record. - EncodeAbbrev(Abbv); - CurAbbrevs.push_back(Abbv); + EncodeAbbrev(*Abbv); + CurAbbrevs.push_back(std::move(Abbv)); return static_cast(CurAbbrevs.size())-1 + bitc::FIRST_APPLICATION_ABBREV; } @@ -532,13 +532,13 @@ class BitstreamWriter { /// EmitBlockInfoAbbrev - Emit a DEFINE_ABBREV record for the specified /// BlockID. - unsigned EmitBlockInfoAbbrev(unsigned BlockID, BitCodeAbbrev *Abbv) { + unsigned EmitBlockInfoAbbrev(unsigned BlockID, std::shared_ptr Abbv) { SwitchToBlockID(BlockID); - EncodeAbbrev(Abbv); + EncodeAbbrev(*Abbv); // Add the abbrev to the specified block record. BlockInfo &Info = getOrCreateBlockInfo(BlockID); - Info.Abbrevs.push_back(Abbv); + Info.Abbrevs.push_back(std::move(Abbv)); return Info.Abbrevs.size()-1+bitc::FIRST_APPLICATION_ABBREV; } diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index be8822df3db..f0be955110f 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -140,6 +140,9 @@ class AsmPrinter : public MachineFunctionPass { /// If the target supports dwarf debug info, this pointer is non-null. DwarfDebug *DD; + /// If the current module uses dwarf CFI annotations strictly for debugging. + bool isCFIMoveForDebugging; + protected: explicit AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); @@ -262,6 +265,10 @@ class AsmPrinter : public MachineFunctionPass { enum CFIMoveType { CFI_M_None, CFI_M_EH, CFI_M_Debug }; CFIMoveType needsCFIMoves(); + /// Returns false if needsCFIMoves() == CFI_M_EH for any function + /// in the module. + bool needsOnlyDebugCFIMoves() const { return isCFIMoveForDebugging; } + bool needsSEHMoves(); /// Print to the current output stream assembly representations of the diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index df0dc1a38ae..8e96336b981 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -925,7 +925,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return LT.first; } - unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; } + unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, + const SCEV *) { + return 0; + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) { assert(Ty->isVectorTy() && "Expect a vector type"); diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h index 1e3476cd839..09c3bf6a1b5 100644 --- a/include/llvm/CodeGen/DIE.h +++ b/include/llvm/CodeGen/DIE.h @@ -651,6 +651,9 @@ class DIE : IntrusiveBackListNode, public DIEValueList { unsigned AbbrevNumber = ~0u; /// Dwarf tag code. 
dwarf::Tag Tag = (dwarf::Tag)0; + /// Set to true to force a DIE to emit an abbreviation that says it has + /// children even when it doesn't. This is used for unit testing purposes. + bool ForceChildren; /// Children DIEs. IntrusiveBackList Children; @@ -659,7 +662,8 @@ class DIE : IntrusiveBackListNode, public DIEValueList { PointerUnion Owner; DIE() = delete; - explicit DIE(dwarf::Tag Tag) : Offset(0), Size(0), Tag(Tag) {} + explicit DIE(dwarf::Tag Tag) : Offset(0), Size(0), Tag(Tag), + ForceChildren(false) {} public: static DIE *get(BumpPtrAllocator &Alloc, dwarf::Tag Tag) { @@ -677,7 +681,8 @@ class DIE : IntrusiveBackListNode, public DIEValueList { /// Get the compile/type unit relative offset of this DIE. unsigned getOffset() const { return Offset; } unsigned getSize() const { return Size; } - bool hasChildren() const { return !Children.empty(); } + bool hasChildren() const { return ForceChildren || !Children.empty(); } + void setForceChildren(bool B) { ForceChildren = B; } typedef IntrusiveBackList::iterator child_iterator; typedef IntrusiveBackList::const_iterator const_child_iterator; diff --git a/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 76e0d47ceea..26ba5c67beb 100644 --- a/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -180,6 +180,8 @@ class IRTranslator : public MachineFunctionPass { /// \pre \p U is a branch instruction. bool translateBr(const User &U, MachineIRBuilder &MIRBuilder); + bool translateSwitch(const User &U, MachineIRBuilder &MIRBuilder); + bool translateExtractValue(const User &U, MachineIRBuilder &MIRBuilder); bool translateInsertValue(const User &U, MachineIRBuilder &MIRBuilder); @@ -292,12 +294,8 @@ class IRTranslator : public MachineFunctionPass { return translateBinaryOp(TargetOpcode::G_FREM, U, MIRBuilder); } - // Stubs to keep the compiler happy while we implement the rest of the // translation. - bool translateSwitch(const User &U, MachineIRBuilder &MIRBuilder) { - return false; - } bool translateIndirectBr(const User &U, MachineIRBuilder &MIRBuilder) { return false; } diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index be811c6fe43..92a9896d7a1 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -308,7 +308,7 @@ class MachineBasicBlock // Iteration support for live in sets. These sets are kept in sorted // order by their register number. 
typedef LiveInVector::const_iterator livein_iterator; - livein_iterator livein_begin() const { return LiveIns.begin(); } + livein_iterator livein_begin() const; livein_iterator livein_end() const { return LiveIns.end(); } bool livein_empty() const { return LiveIns.empty(); } iterator_range liveins() const { diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h index f33758de6a5..5a24b7c8729 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -10,6 +10,8 @@ #ifndef LLVM_LIB_DEBUGINFO_DWARFDIE_H #define LLVM_LIB_DEBUGINFO_DWARFDIE_H +#include "llvm/ADT/iterator.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h" @@ -40,9 +42,6 @@ class DWARFDie { bool isValid() const { return U && Die; } explicit operator bool() const { return isValid(); } - bool operator ==(const DWARFDie &RHS) const { - return Die == RHS.Die && U == RHS.U; - } const DWARFDebugInfoEntry *getDebugInfoEntry() const { return Die; } DWARFUnit *getDwarfUnit() const { return U; } @@ -361,8 +360,61 @@ class DWARFDie { getInlinedChainForAddress(const uint64_t Address, SmallVectorImpl &InlinedChain) const; + class iterator; + + iterator begin() const; + iterator end() const; + iterator_range children() const; }; + +inline bool operator==(const DWARFDie &LHS, const DWARFDie &RHS) { + return LHS.getDebugInfoEntry() == RHS.getDebugInfoEntry() && + LHS.getDwarfUnit() == RHS.getDwarfUnit(); +} + +inline bool operator!=(const DWARFDie &LHS, const DWARFDie &RHS) { + return !(LHS == RHS); +} + +class DWARFDie::iterator : public iterator_facade_base { + DWARFDie Die; + void skipNull() { + if (Die && Die.isNULL()) + Die = DWARFDie(); + } +public: + iterator() = default; + explicit iterator(DWARFDie D) : Die(D) { + // If we start out with only a Null DIE then invalidate. + skipNull(); + } + iterator &operator++() { + Die = Die.getSibling(); + // Don't include the NULL die when iterating. + skipNull(); + return *this; + } + explicit operator bool() const { return Die.isValid(); } + const DWARFDie &operator*() const { return Die; } + bool operator==(const iterator &X) const { return Die == X.Die; } +}; + +// These inline functions must follow the DWARFDie::iterator definition above +// as they use functions from that class. +inline DWARFDie::iterator DWARFDie::begin() const { + return iterator(getFirstChild()); +} + +inline DWARFDie::iterator DWARFDie::end() const { + return iterator(); +} + +inline iterator_range DWARFDie::children() const { + return make_range(begin(), end()); +} } // end namespace llvm diff --git a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h index 43b597de000..83a7b9a844f 100644 --- a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h +++ b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h @@ -47,9 +47,9 @@ class RawByteChannel { /// Locks the channel for writing. 
template Error startSendMessage(const FunctionIdT &FnId, const SequenceIdT &SeqNo) { + writeLock.lock(); if (auto Err = serializeSeq(*this, FnId, SeqNo)) return Err; - writeLock.lock(); return Error::success(); } diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index 2cfe673d970..ecb0435a1e1 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -28,6 +28,10 @@ namespace llvm { +namespace yaml { +template struct MappingTraits; +} + /// \brief Class to accumulate and hold information about a callee. struct CalleeInfo { enum class HotnessType : uint8_t { Unknown = 0, Cold = 1, None = 2, Hot = 3 }; @@ -102,7 +106,7 @@ class GlobalValueSummary { /// \brief Sububclass discriminator (for dyn_cast<> et al.) enum SummaryKind : unsigned { AliasKind, FunctionKind, GlobalVarKind }; - /// Group flags (Linkage, noRename, isOptSize, etc.) as a bitfield. + /// Group flags (Linkage, NotEligibleToImport, etc.) as a bitfield. struct GVFlags { /// \brief The linkage type of the associated global value. /// @@ -113,39 +117,20 @@ class GlobalValueSummary { /// types based on global summary-based analysis. unsigned Linkage : 4; - /// Indicate if the global value cannot be renamed (in a specific section, - /// possibly referenced from inline assembly, etc). - unsigned NoRename : 1; - - /// Indicate if a function contains inline assembly (which is opaque), - /// that may reference a local value. This is used to prevent importing - /// of this function, since we can't promote and rename the uses of the - /// local in the inline assembly. Use a flag rather than bloating the - /// summary with references to every possible local value in the - /// llvm.used set. - unsigned HasInlineAsmMaybeReferencingInternal : 1; + /// Indicate if the global value cannot be imported (e.g. it cannot + /// be renamed or references something that can't be renamed). + unsigned NotEligibleToImport : 1; - /// Indicate if the function is not viable to inline. - unsigned IsNotViableToInline : 1; + /// Indicate that the global value must be considered a live root for + /// index-based liveness analysis. Used for special LLVM values such as + /// llvm.global_ctors that the linker does not know about. + unsigned LiveRoot : 1; /// Convenience Constructors - explicit GVFlags(GlobalValue::LinkageTypes Linkage, bool NoRename, - bool HasInlineAsmMaybeReferencingInternal, - bool IsNotViableToInline) - : Linkage(Linkage), NoRename(NoRename), - HasInlineAsmMaybeReferencingInternal( - HasInlineAsmMaybeReferencingInternal), - IsNotViableToInline(IsNotViableToInline) {} - - GVFlags(const GlobalValue &GV) - : Linkage(GV.getLinkage()), NoRename(GV.hasSection()), - HasInlineAsmMaybeReferencingInternal(false) { - IsNotViableToInline = false; - if (const auto *F = dyn_cast(&GV)) - // Inliner doesn't handle variadic functions. - // FIXME: refactor this to use the same code that inliner is using. - IsNotViableToInline = F->isVarArg(); - } + explicit GVFlags(GlobalValue::LinkageTypes Linkage, + bool NotEligibleToImport, bool LiveRoot) + : Linkage(Linkage), NotEligibleToImport(NotEligibleToImport), + LiveRoot(LiveRoot) {} }; private: @@ -213,31 +198,19 @@ class GlobalValueSummary { Flags.Linkage = Linkage; } - bool isNotViableToInline() const { return Flags.IsNotViableToInline; } - - /// Return true if this summary is for a GlobalValue that needs promotion - /// to be referenced from another module. 
- bool needsRenaming() const { return GlobalValue::isLocalLinkage(linkage()); } + /// Return true if this global value can't be imported. + bool notEligibleToImport() const { return Flags.NotEligibleToImport; } - /// Return true if this global value cannot be renamed (in a specific section, - /// possibly referenced from inline assembly, etc). - bool noRename() const { return Flags.NoRename; } + /// Return true if this global value must be considered a root for live + /// value analysis on the index. + bool liveRoot() const { return Flags.LiveRoot; } - /// Flag that this global value cannot be renamed (in a specific section, - /// possibly referenced from inline assembly, etc). - void setNoRename() { Flags.NoRename = true; } + /// Flag that this global value must be considered a root for live + /// value analysis on the index. + void setLiveRoot() { Flags.LiveRoot = true; } - /// Return true if this global value possibly references another value - /// that can't be renamed. - bool hasInlineAsmMaybeReferencingInternal() const { - return Flags.HasInlineAsmMaybeReferencingInternal; - } - - /// Flag that this global value possibly references another value that - /// can't be renamed. - void setHasInlineAsmMaybeReferencingInternal() { - Flags.HasInlineAsmMaybeReferencingInternal = true; - } + /// Flag that this global value cannot be imported. + void setNotEligibleToImport() { Flags.NotEligibleToImport = true; } /// Return the list of values referenced by this global value definition. ArrayRef refs() const { return RefEdgeList; } @@ -330,6 +303,30 @@ class GlobalVarSummary : public GlobalValueSummary { } }; +struct TypeTestResolution { + /// Specifies which kind of type check we should emit for this byte array. + /// See http://clang.llvm.org/docs/ControlFlowIntegrityDesign.html for full + /// details on each kind of check; the enumerators are described with + /// reference to that document. + enum Kind { + Unsat, ///< Unsatisfiable type (i.e. no global has this type metadata) + ByteArray, ///< Test a byte array (first example) + Inline, ///< Inlined bit vector ("Short Inline Bit Vectors") + Single, ///< Single element (last example in "Short Inline Bit Vectors") + AllOnes, ///< All-ones bit vector ("Eliminating Bit Vector Checks for + /// All-Ones Bit Vectors") + } TheKind = Unsat; + + /// Range of the size expressed as a bit width. For example, if the size is in + /// range [0,256), this number will be 8. This helps generate the most compact + /// instruction sequences. + unsigned SizeBitWidth = 0; +}; + +struct TypeIdSummary { + TypeTestResolution TTRes; +}; + /// 160 bits SHA1 typedef std::array ModuleHash; @@ -370,11 +367,20 @@ class ModuleSummaryIndex { /// Holds strings for combined index, mapping to the corresponding module ID. ModulePathStringTableTy ModulePathStringTable; + /// Mapping from type identifiers to summary information for that type + /// identifier. + // FIXME: Add bitcode read/write support for this field. + std::map TypeIdMap; + + // YAML I/O support. + friend yaml::MappingTraits; + public: gvsummary_iterator begin() { return GlobalValueMap.begin(); } const_gvsummary_iterator begin() const { return GlobalValueMap.begin(); } gvsummary_iterator end() { return GlobalValueMap.end(); } const_gvsummary_iterator end() const { return GlobalValueMap.end(); } + size_t size() const { return GlobalValueMap.size(); } /// Get the list of global value summary objects for a given value name. 
const GlobalValueSummaryList &getGlobalValueSummaryList(StringRef ValueName) { diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h new file mode 100644 index 00000000000..a8c8ff9ef2e --- /dev/null +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -0,0 +1,111 @@ +//===-- llvm/ModuleSummaryIndexYAML.h - YAML I/O for summary ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_MODULESUMMARYINDEXYAML_H +#define LLVM_IR_MODULESUMMARYINDEXYAML_H + +#include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/Support/YAMLTraits.h" + +namespace llvm { +namespace yaml { + +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &io, TypeTestResolution::Kind &value) { + io.enumCase(value, "Unsat", TypeTestResolution::Unsat); + io.enumCase(value, "ByteArray", TypeTestResolution::ByteArray); + io.enumCase(value, "Inline", TypeTestResolution::Inline); + io.enumCase(value, "Single", TypeTestResolution::Single); + io.enumCase(value, "AllOnes", TypeTestResolution::AllOnes); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &io, TypeTestResolution &res) { + io.mapRequired("Kind", res.TheKind); + io.mapRequired("SizeBitWidth", res.SizeBitWidth); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &io, TypeIdSummary& summary) { + io.mapRequired("TTRes", summary.TTRes); + } +}; + +struct FunctionSummaryYaml { + std::vector TypeTests; +}; + +} // End yaml namespace +} // End llvm namespace + +LLVM_YAML_IS_SEQUENCE_VECTOR(uint64_t) + +namespace llvm { +namespace yaml { + +template <> struct MappingTraits { + static void mapping(IO &io, FunctionSummaryYaml& summary) { + io.mapRequired("TypeTests", summary.TypeTests); + } +}; + +} // End yaml namespace +} // End llvm namespace + +LLVM_YAML_IS_STRING_MAP(TypeIdSummary) +LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionSummaryYaml) + +namespace llvm { +namespace yaml { + +// FIXME: Add YAML mappings for the rest of the module summary. 
+template <> struct CustomMappingTraits { + static void inputOne(IO &io, StringRef Key, GlobalValueSummaryMapTy &V) { + std::vector FSums; + io.mapRequired(Key.str().c_str(), FSums); + uint64_t KeyInt; + if (Key.getAsInteger(0, KeyInt)) { + io.setError("key not an integer"); + return; + } + auto &Elem = V[KeyInt]; + for (auto &FSum : FSums) { + GlobalValueSummary::GVFlags GVFlags(GlobalValue::ExternalLinkage, false, + false); + Elem.push_back(llvm::make_unique( + GVFlags, 0, ArrayRef{}, + ArrayRef{}, std::move(FSum.TypeTests))); + } + } + static void output(IO &io, GlobalValueSummaryMapTy &V) { + for (auto &P : V) { + std::vector FSums; + for (auto &Sum : P.second) { + if (auto *FSum = dyn_cast(Sum.get())) + FSums.push_back(FunctionSummaryYaml{FSum->type_tests()}); + } + if (!FSums.empty()) + io.mapRequired(llvm::utostr(P.first).c_str(), FSums); + } + } +}; + +template <> struct MappingTraits { + static void mapping(IO &io, ModuleSummaryIndex& index) { + io.mapRequired("GlobalValueMap", index.GlobalValueMap); + io.mapRequired("TypeIdMap", index.TypeIdMap); + } +}; + +} // End yaml namespace +} // End llvm namespace + +#endif diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h index 3e4edd893d3..7a63956f1cd 100644 --- a/include/llvm/IR/PassManager.h +++ b/include/llvm/IR/PassManager.h @@ -64,32 +64,31 @@ namespace llvm { struct alignas(8) AnalysisKey {}; /// A special type used to provide an address that identifies a set of related -/// analyses. +/// analyses. These sets are primarily used below to mark sets of analyses as +/// preserved. /// -/// These sets are primarily used below to mark sets of analyses as preserved. -/// An example would be analyses depending only on the CFG of a function. -/// A transformation can mark that it is preserving the CFG of a function and -/// then analyses can check for this rather than each transform having to fully -/// enumerate every analysis preserved. +/// For example, a transformation can indicate that it preserves the CFG of a +/// function by preserving the appropriate AnalysisSetKey. An analysis that +/// depends only on the CFG can then check if that AnalysisSetKey is preserved; +/// if it is, the analysis knows that it itself is preserved. struct alignas(8) AnalysisSetKey {}; -/// Class for tracking what analyses are preserved after a transformation pass -/// runs over some unit of IR. +/// A set of analyses that are preserved following a run of a transformation +/// pass. /// -/// Transformation passes build and return these objects when run over the IR -/// to communicate which analyses remain valid afterward. For most passes this -/// is fairly simple: if they don't change anything all analyses are preserved, +/// Transformation passes build and return these objects to communicate which +/// analyses are still valid after the transformation. For most passes this is +/// fairly simple: if they don't change anything all analyses are preserved, /// otherwise only a short list of analyses that have been explicitly updated /// are preserved. /// -/// This class also provides the ability to mark abstract *sets* of analyses as -/// preserved. These sets allow passes to indicate that they preserve broad -/// aspects of the IR (such as its CFG) and analyses to opt in to that being -/// sufficient without the passes having to fully enumerate such analyses. +/// This class also lets transformation passes mark abstract *sets* of analyses +/// as preserved. 
A transformation that (say) does not alter the CFG can +/// indicate such by marking a particular AnalysisSetKey as preserved, and +/// then analyses can query whether that AnalysisSetKey is preserved. /// -/// Finally, this class can represent "abandoning" an analysis, which marks it -/// as not-preserved even if it would be covered by some abstract set of -/// analyses. +/// Finally, this class can represent an "abandoned" analysis, which is +/// not preserved even if it would be covered by some abstract set of analyses. /// /// Given a `PreservedAnalyses` object, an analysis will typically want to /// figure out whether it is preserved. In the example below, MyAnalysisType is @@ -120,7 +119,8 @@ class PreservedAnalyses { /// Mark an analysis as preserved. template void preserve() { preserve(AnalysisT::ID()); } - /// Mark an analysis as preserved using its ID. + /// \brief Given an analysis's ID, mark the analysis as preserved, adding it + /// to the set. void preserve(AnalysisKey *ID) { // Clear this ID from the explicit not-preserved set if present. NotPreservedAnalysisIDs.erase(ID); @@ -224,17 +224,17 @@ class PreservedAnalyses { : PA(PA), ID(ID), IsAbandoned(PA.NotPreservedAnalysisIDs.count(ID)) {} public: - /// Returns true if the checker's analysis was not abandoned and the - /// analysis is either is explicitly preserved or all analyses are - /// preserved. + /// Returns true if the checker's analysis was not abandoned and either + /// - the analysis is explicitly preserved or + /// - all analyses are preserved. bool preserved() { return !IsAbandoned && (PA.PreservedIDs.count(&AllAnalysesKey) || PA.PreservedIDs.count(ID)); } - /// Returns true if the checker's analysis was not abandoned and either the - /// provided set type is either explicitly preserved or all analyses are - /// preserved. + /// Returns true if the checker's analysis was not abandoned and either + /// - \p AnalysisSetT is explicitly preserved or + /// - all analyses are preserved. template bool preservedSet() { AnalysisSetKey *SetID = AnalysisSetT::ID(); return !IsAbandoned && (PA.PreservedIDs.count(&AllAnalysesKey) || @@ -262,8 +262,8 @@ class PreservedAnalyses { /// Test whether all analyses are preserved (and none are abandoned). /// - /// This lets analyses optimize for the common case where a transformation - /// made no changes to the IR. + /// This is used primarily to optimize for the common case of a transformation + /// which makes no changes to the IR. bool areAllPreserved() const { return NotPreservedAnalysisIDs.empty() && PreservedIDs.count(&AllAnalysesKey); @@ -307,9 +307,9 @@ template class AnalysisManager; /// A CRTP mix-in to automatically provide informational APIs needed for /// passes. /// -/// This provides some boiler plate for types that are passes. +/// This provides some boilerplate for types that are passes. template struct PassInfoMixin { - /// Returns the name of the derived pass type. + /// Gets the name of the pass we are mixed into. static StringRef name() { StringRef Name = getTypeName(); if (Name.startswith("llvm::")) @@ -318,41 +318,35 @@ template struct PassInfoMixin { } }; -/// A CRTP mix-in to automatically provide informational APIs needed for -/// analysis passes. +/// A CRTP mix-in that provides informational APIs needed for analysis passes. /// -/// This provides some boiler plate for types that are analysis passes. It -/// automatically mixes in \c PassInfoMixin and adds informational APIs -/// specifically used for analyses. 
+/// This provides some boilerplate for types that are analysis passes. It +/// automatically mixes in \c PassInfoMixin. template struct AnalysisInfoMixin : PassInfoMixin { /// Returns an opaque, unique ID for this analysis type. /// - /// This ID is a pointer type that is guaranteed to be 8-byte aligned and - /// thus suitable for use in sets, maps, and other data structures optimized - /// for pointer-like types using the alignment-provided low bits. + /// This ID is a pointer type that is guaranteed to be 8-byte aligned and thus + /// suitable for use in sets, maps, and other data structures that use the low + /// bits of pointers. /// /// Note that this requires the derived type provide a static \c AnalysisKey /// member called \c Key. /// - /// FIXME: The only reason the derived type needs to provide this rather than - /// this mixin providing it is due to broken implementations which cannot - /// correctly unique a templated static so that they have the same addresses - /// for each instantiation and are definitively emitted once for each - /// instantiation. The only currently known platform with this limitation are - /// Windows DLL builds, specifically building each part of LLVM as a DLL. If - /// we ever remove that build configuration, this mixin can provide the - /// static key as well. + /// FIXME: The only reason the mixin type itself can't declare the Key value + /// is that some compilers cannot correctly unique a templated static variable + /// so it has the same addresses in each instantiation. The only currently + /// known platform with this limitation is Windows DLL builds, specifically + /// building each part of LLVM as a DLL. If we ever remove that build + /// configuration, this mixin can provide the static key as well. static AnalysisKey *ID() { return &DerivedT::Key; } }; -/// A class template to provide analysis sets for IR units. +/// This templated class represents "all analyses that operate over \" (e.g. a Function or a Module) in instances of +/// PreservedAnalysis. /// -/// Analyses operate on units of IR. It is useful to be able to talk about -/// preservation of all analyses for a given unit of IR as a set. This class -/// template can be used with the \c PreservedAnalyses API for that purpose and -/// the \c AnalysisManager will automatically check and use this set to skip -/// invalidation events. +/// This lets a transformation say e.g. "I preserved all function analyses". /// /// Note that you must provide an explicit instantiation declaration and /// definition for this template in order to get the correct behavior on @@ -371,17 +365,18 @@ template AnalysisSetKey AllAnalysesOn::SetKey; extern template class AllAnalysesOn; extern template class AllAnalysesOn; -/// \brief Manages a sequence of passes over units of IR. +/// \brief Manages a sequence of passes over a particular unit of IR. /// -/// A pass manager contains a sequence of passes to run over units of IR. It is -/// itself a valid pass over that unit of IR, and when over some given IR will -/// run each pass in sequence. This is the primary and most basic building -/// block of a pass pipeline. +/// A pass manager contains a sequence of passes to run over a particular unit +/// of IR (e.g. Functions, Modules). It is itself a valid pass over that unit of +/// IR, and when run over some given IR will run each of its contained passes in +/// sequence. Pass managers are the primary and most basic building block of a +/// pass pipeline. 
/// -/// If it is run with an \c AnalysisManager argument, it will propagate -/// that analysis manager to each pass it runs, as well as calling the analysis -/// manager's invalidation routine with the PreservedAnalyses of each pass it -/// runs. +/// When you run a pass manager, you provide an \c AnalysisManager +/// argument. The pass manager will propagate that analysis manager to each +/// pass it runs, and will call the analysis manager's invalidation routine with +/// the PreservedAnalyses of each pass it runs. template , typename... ExtraArgTs> @@ -390,7 +385,7 @@ class PassManager : public PassInfoMixin< public: /// \brief Construct a pass manager. /// - /// It can be passed a flag to get debug logging as the passes are run. + /// If \p DebugLogging is true, we'll log our progress to llvm::dbgs(). explicit PassManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {} // FIXME: These are equivalent to the default move constructor/move @@ -400,13 +395,15 @@ class PassManager : public PassInfoMixin< PassManager(PassManager &&Arg) : Passes(std::move(Arg.Passes)), DebugLogging(std::move(Arg.DebugLogging)) {} + PassManager &operator=(PassManager &&RHS) { Passes = std::move(RHS.Passes); DebugLogging = std::move(RHS.DebugLogging); return *this; } - /// \brief Run all of the passes in this manager over the IR. + /// \brief Run all of the passes in this manager over the given unit of IR. + /// ExtraArgs are passed to each pass. PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM, ExtraArgTs... ExtraArgs) { PreservedAnalyses PA = PreservedAnalyses::all(); @@ -425,7 +422,7 @@ class PassManager : public PassInfoMixin< // invalidates analyses. AM.invalidate(IR, PassPA); - // Finally, we intersect the preserved analyses to compute the aggregate + // Finally, intersect the preserved analyses to compute the aggregate // preserved set for this pass manager. PA.intersect(std::move(PassPA)); @@ -473,30 +470,29 @@ extern template class PassManager; /// \brief Convenience typedef for a pass manager over functions. typedef PassManager FunctionPassManager; -/// \brief A generic analysis pass manager with lazy running and caching of +/// \brief A container for analyses that lazily runs them and caches their /// results. /// -/// This analysis manager can be used for any IR unit where the address of the -/// IR unit sufficies as its identity. It manages the cache for a unit of IR via -/// the address of each unit of IR cached. +/// This class can manage analyses for any IR unit where the address of the IR +/// unit sufficies as its identity. template class AnalysisManager { public: class Invalidator; private: - // Now that we've defined our invalidator, we can build types for the concept - // types. + // Now that we've defined our invalidator, we can define the concept types. typedef detail::AnalysisResultConcept ResultConceptT; typedef detail::AnalysisPassConcept PassConceptT; - /// \brief List of function analysis pass IDs and associated concept pointers. + /// \brief List of analysis pass IDs and associated concept pointers. /// /// Requires iterators to be valid across appending new entries and arbitrary - /// erases. Provides the analysis ID to enable finding iterators to a given entry - /// in maps below, and provides the storage for the actual result concept. + /// erases. Provides the analysis ID to enable finding iterators to a given + /// entry in maps below, and provides the storage for the actual result + /// concept. 
typedef std::list>> AnalysisResultListT; @@ -504,8 +500,8 @@ template class AnalysisManager { typedef DenseMap AnalysisResultListMapT; /// \brief Map type from a pair of analysis ID and IRUnitT pointer to an - /// iterator into a particular result list which is where the actual result - /// is stored. + /// iterator into a particular result list (which is where the actual analysis + /// result is stored). typedef DenseMap, typename AnalysisResultListT::iterator> AnalysisResultMapT; @@ -515,28 +511,28 @@ template class AnalysisManager { /// /// When an analysis result embeds handles to other analysis results, it /// needs to be invalidated both when its own information isn't preserved and - /// if any of those embedded analysis results end up invalidated. We pass in - /// an \c Invalidator object from the analysis manager in order to let the - /// analysis results themselves define the dependency graph on the fly. This - /// avoids building an explicit data structure representation of the + /// when any of its embedded analysis results end up invalidated. We pass an + /// \c Invalidator object as an argument to \c invalidate() in order to let + /// the analysis results themselves define the dependency graph on the fly. + /// This lets us avoid building building an explicit representation of the /// dependencies between analysis results. class Invalidator { public: /// Trigger the invalidation of some other analysis pass if not already - /// handled and return whether it will in fact be invalidated. + /// handled and return whether it was in fact invalidated. /// /// This is expected to be called from within a given analysis result's \c /// invalidate method to trigger a depth-first walk of all inter-analysis /// dependencies. The same \p IR unit and \p PA passed to that result's \c /// invalidate method should in turn be provided to this routine. /// - /// The first time this is called for a given analysis pass, it will - /// trigger the corresponding result's \c invalidate method to be called. - /// Subsequent calls will use a cache of the results of that initial call. - /// It is an error to form cyclic dependencies between analysis results. + /// The first time this is called for a given analysis pass, it will call + /// the corresponding result's \c invalidate method. Subsequent calls will + /// use a cache of the results of that initial call. It is an error to form + /// cyclic dependencies between analysis results. /// - /// This returns true if the given analysis pass's result is invalid and - /// any dependecies on it will become invalid as a result. + /// This returns true if the given analysis's result is invalid. Any + /// dependecies on it will become invalid as a result. template bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA) { typedef detail::AnalysisResultModel class AnalysisManager { auto &Result = static_cast(*RI->second->second); - // Insert into the map whether the result should be invalidated and - // return that. Note that we cannot re-use IMapI and must do a fresh - // insert here as calling the invalidate routine could (recursively) - // insert things into the map making any iterator or reference invalid. + // Insert into the map whether the result should be invalidated and return + // that. Note that we cannot reuse IMapI and must do a fresh insert here, + // as calling invalidate could (recursively) insert things into the map, + // making any iterator or reference invalid. 
bool Inserted; std::tie(IMapI, Inserted) = IsResultInvalidated.insert({ID, Result.invalidate(IR, PA, *this)}); @@ -600,8 +596,7 @@ template class AnalysisManager { /// \brief Construct an empty analysis manager. /// - /// A flag can be passed to indicate that the manager should perform debug - /// logging. + /// If \p DebugLogging is true, we'll log our progress to llvm::dbgs(). AnalysisManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {} AnalysisManager(AnalysisManager &&) = default; AnalysisManager &operator=(AnalysisManager &&) = default; @@ -614,11 +609,11 @@ template class AnalysisManager { return AnalysisResults.empty(); } - /// \brief Clear any results for a single unit of IR. + /// \brief Clear any cached analysis results for a single unit of IR. /// - /// This doesn't invalidate but directly clears the results. It is useful - /// when the IR is being removed and we want to clear out all the memory - /// pinned for it. + /// This doesn't invalidate, but instead simply deletes, the relevant results. + /// It is useful when the IR is being removed and we want to clear out all the + /// memory pinned for it. void clear(IRUnitT &IR) { if (DebugLogging) dbgs() << "Clearing all analysis results for: " << IR.getName() << "\n"; @@ -626,7 +621,7 @@ template class AnalysisManager { auto ResultsListI = AnalysisResultLists.find(&IR); if (ResultsListI == AnalysisResultLists.end()) return; - // Clear the map pointing into the results list. + // Delete the map entries that point into the results list. for (auto &IDAndResult : ResultsListI->second) AnalysisResults.erase({IDAndResult.first, &IR}); @@ -634,21 +629,20 @@ template class AnalysisManager { AnalysisResultLists.erase(ResultsListI); } - /// \brief Clear the analysis result cache. + /// \brief Clear all analysis results cached by this AnalysisManager. /// - /// This routine allows cleaning up when the set of IR units itself has - /// potentially changed, and thus we can't even look up a a result and - /// invalidate it directly. Notably, this does *not* call invalidate - /// functions as there is nothing to be done for them. + /// Like \c clear(IRUnitT&), this doesn't invalidate the results; it simply + /// deletes them. This lets you clean up the AnalysisManager when the set of + /// IR units itself has potentially changed, and thus we can't even look up a + /// a result and invalidate/clear it directly. void clear() { AnalysisResults.clear(); AnalysisResultLists.clear(); } - /// \brief Get the result of an analysis pass for this module. + /// \brief Get the result of an analysis pass for a given IR unit. /// - /// If there is not a valid cached result in the manager already, this will - /// re-run the analysis to produce a valid result. + /// Runs the analysis if a cached result is not available. template typename PassT::Result &getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs) { assert(AnalysisPasses.count(PassT::ID()) && @@ -661,7 +655,7 @@ template class AnalysisManager { return static_cast(ResultConcept).Result; } - /// \brief Get the cached result of an analysis pass for this module. + /// \brief Get the cached result of an analysis pass for a given IR unit. /// /// This method never runs the analysis. /// @@ -683,22 +677,21 @@ template class AnalysisManager { /// \brief Register an analysis pass with the manager. /// - /// The argument is a callable whose result is a pass. This allows passing in - /// a lambda to construct the pass. + /// The parameter is a callable whose result is an analysis pass. 
This allows + /// passing in a lambda to construct the analysis. /// - /// The pass type registered is the result type of calling the argument. If - /// that pass has already been registered, then the argument will not be - /// called and this function will return false. Otherwise, the pass type - /// becomes registered, with the instance provided by calling the argument - /// once, and this function returns true. + /// The analysis type to register is the type returned by calling the \c + /// PassBuilder argument. If that type has already been registered, then the + /// argument will not be called and this function will return false. + /// Otherwise, we register the analysis returned by calling \c PassBuilder(), + /// and this function returns true. /// - /// While this returns whether or not the pass type was already registered, - /// there in't an independent way to query that as that would be prone to - /// risky use when *querying* the analysis manager. Instead, the only - /// supported use case is avoiding duplicate registry of an analysis. This - /// interface also lends itself to minimizing the number of times we have to - /// do lookups for analyses or construct complex passes only to throw them - /// away. + /// (Note: Although the return value of this function indicates whether or not + /// an analysis was previously registered, there intentionally isn't a way to + /// query this directly. Instead, you should just register all the analyses + /// you might want and let this class run them lazily. This idiom lets us + /// minimize the number of times we have to look up analyses in our + /// hashtable.) template bool registerPass(PassBuilderT &&PassBuilder) { typedef decltype(PassBuilder()) PassT; @@ -718,17 +711,18 @@ template class AnalysisManager { /// \brief Invalidate a specific analysis pass for an IR module. /// - /// Note that the analysis result can disregard invalidation. + /// Note that the analysis result can disregard invalidation, if it determines + /// it is in fact still valid. template void invalidate(IRUnitT &IR) { assert(AnalysisPasses.count(PassT::ID()) && "This analysis pass was not registered prior to being invalidated"); invalidateImpl(PassT::ID(), IR); } - /// \brief Invalidate analyses cached for an IR unit. + /// \brief Invalidate cached analyses for an IR unit. /// /// Walk through all of the analyses pertaining to this unit of IR and - /// invalidate them unless they are preserved by the PreservedAnalyses set. + /// invalidate them, unless they are preserved by the PreservedAnalyses set. void invalidate(IRUnitT &IR, const PreservedAnalyses &PA) { // We're done if all analyses on this IR unit are preserved. if (PA.allAnalysesInSetPreserved>()) @@ -738,8 +732,8 @@ template class AnalysisManager { dbgs() << "Invalidating all non-preserved analyses for: " << IR.getName() << "\n"; - // Track whether each pass's result is invalidated. Memoize the results - // using the IsResultInvalidated map. + // Track whether each analysis's result is invalidated in + // IsResultInvalidated. SmallDenseMap IsResultInvalidated; Invalidator Inv(IsResultInvalidated, AnalysisResults); AnalysisResultListT &ResultsList = AnalysisResultLists[&IR]; @@ -758,9 +752,9 @@ template class AnalysisManager { // Try to invalidate the result, giving it the Invalidator so it can // recursively query for any dependencies it has and record the result. 
- // Note that we cannot re-use 'IMapI' here or pre-insert the ID as the - // invalidate method may insert things into the map as well, invalidating - // any iterator or pointer. + // Note that we cannot reuse 'IMapI' here or pre-insert the ID, as + // Result.invalidate may insert things into the map, invalidating our + // iterator. bool Inserted = IsResultInvalidated.insert({ID, Result.invalidate(IR, PA, Inv)}) .second; @@ -873,7 +867,7 @@ template class AnalysisManager { /// analysis result. AnalysisResultMapT AnalysisResults; - /// \brief A flag indicating whether debug logging is enabled. + /// \brief Indicates whether we log to \c llvm::dbgs(). bool DebugLogging; }; diff --git a/include/llvm/LTO/LTO.h b/include/llvm/LTO/LTO.h index bc435702157..78ac73a7418 100644 --- a/include/llvm/LTO/LTO.h +++ b/include/llvm/LTO/LTO.h @@ -382,6 +382,10 @@ class LTO { /// The unmangled name of the global. std::string IRName; + /// Keep track if the symbol is visible outside of ThinLTO (i.e. in + /// either a regular object or the regular LTO partition). + bool VisibleOutsideThinLTO = false; + bool UnnamedAddr = true; /// This field keeps track of the partition number of this global. The @@ -405,6 +409,9 @@ class LTO { /// This global is either used by more than one partition or has an /// external reference, and therefore cannot be internalized. External = -2u, + + /// The RegularLTO partition + RegularLTO = 0, }; }; diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h index a300c4f6fb0..25642379ac9 100644 --- a/include/llvm/MC/MCTargetOptions.h +++ b/include/llvm/MC/MCTargetOptions.h @@ -11,6 +11,7 @@ #define LLVM_MC_MCTARGETOPTIONS_H #include +#include namespace llvm { @@ -51,11 +52,17 @@ class MCTargetOptions { bool PreserveAsmComments : 1; int DwarfVersion; + /// getABIName - If this returns a non-empty string this represents the /// textual name of the ABI that we want the backend to use, e.g. o32, or /// aapcs-linux. StringRef getABIName() const; std::string ABIName; + + /// Additional paths to search for `.include` directives when using the + /// integrated assembler. + std::vector IASSearchPaths; + MCTargetOptions(); }; @@ -75,7 +82,8 @@ inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) { ARE_EQUAL(ShowMCInst) && ARE_EQUAL(AsmVerbose) && ARE_EQUAL(DwarfVersion) && - ARE_EQUAL(ABIName)); + ARE_EQUAL(ABIName) && + ARE_EQUAL(IASSearchPaths)); #undef ARE_EQUAL } diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index 586999794d5..ad21d8af66e 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -27,7 +27,6 @@ #ifndef LLVM_SUPPORT_FILESYSTEM_H #define LLVM_SUPPORT_FILESYSTEM_H -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -37,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -829,28 +829,23 @@ class directory_iterator { }; namespace detail { - /// RecDirIterState - Keeps state for the recursive_directory_iterator. It is - /// reference counted in order to preserve InputIterator semantics on copy. - struct RecDirIterState : public RefCountedBase { - RecDirIterState() - : Level(0) - , HasNoPushRequest(false) {} - + /// Keeps state for the recursive_directory_iterator. 
+ struct RecDirIterState { std::stack> Stack; - uint16_t Level; - bool HasNoPushRequest; + uint16_t Level = 0; + bool HasNoPushRequest = false; }; } // end namespace detail /// recursive_directory_iterator - Same as directory_iterator except for it /// recurses down into child directories. class recursive_directory_iterator { - IntrusiveRefCntPtr State; + std::shared_ptr State; public: recursive_directory_iterator() = default; explicit recursive_directory_iterator(const Twine &path, std::error_code &ec) - : State(new detail::RecDirIterState) { + : State(std::make_shared()) { State->Stack.push(directory_iterator(path, ec)); if (State->Stack.top() == directory_iterator()) State.reset(); diff --git a/include/llvm/Support/TarWriter.h b/include/llvm/Support/TarWriter.h new file mode 100644 index 00000000000..44bdcaf2c46 --- /dev/null +++ b/include/llvm/Support/TarWriter.h @@ -0,0 +1,32 @@ +//===-- llvm/Support/TarWriter.h - Tar archive file creator -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_TAR_WRITER_H +#define LLVM_SUPPORT_TAR_WRITER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +class TarWriter { +public: + static Expected> create(StringRef OutputPath, + StringRef BaseDir); + + void append(StringRef Path, StringRef Data); + +private: + TarWriter(int FD, StringRef BaseDir); + raw_fd_ostream OS; + std::string BaseDir; +}; +} + +#endif diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h index d7acbe883c5..eaea092c917 100644 --- a/include/llvm/Transforms/IPO/FunctionImport.h +++ b/include/llvm/Transforms/IPO/FunctionImport.h @@ -86,11 +86,15 @@ class FunctionImportPass : public PassInfoMixin { /// \p ExportLists contains for each Module the set of globals (GUID) that will /// be imported by another module, or referenced by such a function. I.e. this /// is the set of globals that need to be promoted/renamed appropriately. +/// +/// \p DeadSymbols (optional) contains a list of GUID that are deemed "dead" and +/// will be ignored for the purpose of importing. void ComputeCrossModuleImport( const ModuleSummaryIndex &Index, const StringMap &ModuleToDefinedGVSummaries, StringMap &ImportLists, - StringMap &ExportLists); + StringMap &ExportLists, + const DenseSet *DeadSymbols = nullptr); /// Compute all the imports for the given module using the Index. /// @@ -100,6 +104,13 @@ void ComputeCrossModuleImportForModule( StringRef ModulePath, const ModuleSummaryIndex &Index, FunctionImporter::ImportMapTy &ImportList); +/// Compute all the symbols that are "dead": i.e these that can't be reached +/// in the graph from any of the given symbols listed in +/// \p GUIDPreservedSymbols. +DenseSet +computeDeadSymbols(const ModuleSummaryIndex &Index, + const DenseSet &GUIDPreservedSymbols); + /// Compute the set of summaries needed for a ThinLTO backend compilation of /// \p ModulePath. 
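A minimal usage sketch for the TarWriter interface declared above; the archive name, base directory, and member contents are placeholder values, not anything taken from the patch:

  #include "llvm/Support/Error.h"
  #include "llvm/Support/TarWriter.h"
  #include <memory>

  // Create example.tar and append a single member, using only the two
  // entry points declared in TarWriter.h (create and append).
  static llvm::Error writeExampleTar() {
    llvm::Expected<std::unique_ptr<llvm::TarWriter>> TW =
        llvm::TarWriter::create("example.tar", "base");
    if (!TW)
      return TW.takeError();
    (*TW)->append("hello.txt", "hello, tar\n");
    return llvm::Error::success();
  }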
// diff --git a/include/llvm/Transforms/IPO/LowerTypeTests.h b/include/llvm/Transforms/IPO/LowerTypeTests.h index 23c59c199a3..ca6e1b878df 100644 --- a/include/llvm/Transforms/IPO/LowerTypeTests.h +++ b/include/llvm/Transforms/IPO/LowerTypeTests.h @@ -60,10 +60,6 @@ struct BitSetInfo { bool containsGlobalOffset(uint64_t Offset) const; - bool containsValue(const DataLayout &DL, - const DenseMap &GlobalLayout, - Value *V, uint64_t COffset = 0) const; - void print(raw_ostream &OS) const; }; diff --git a/include/llvm/Transforms/Utils/FunctionImportUtils.h b/include/llvm/Transforms/Utils/FunctionImportUtils.h index 57b7d0fcd7c..f18cd92310b 100644 --- a/include/llvm/Transforms/Utils/FunctionImportUtils.h +++ b/include/llvm/Transforms/Utils/FunctionImportUtils.h @@ -40,9 +40,20 @@ class FunctionImportGlobalProcessing { /// as part of a different backend compilation process. bool HasExportedFunctions = false; + /// Set of llvm.*used values, in order to validate that we don't try + /// to promote any non-renamable values. + SmallPtrSet Used; + /// Check if we should promote the given local value to global scope. bool shouldPromoteLocalToGlobal(const GlobalValue *SGV); +#ifndef NDEBUG + /// Check if the given value is a local that can't be renamed (promoted). + /// Only used in assertion checking, and disabled under NDEBUG since the Used + /// set will not be populated. + bool isNonRenamableLocal(const GlobalValue &GV) const; +#endif + /// Helper methods to check if we are importing from or potentially /// exporting from the current source module. bool isPerformingImport() const { return GlobalsToImport != nullptr; } @@ -82,6 +93,13 @@ class FunctionImportGlobalProcessing { // may be exported to another backend compilation. if (!GlobalsToImport) HasExportedFunctions = ImportIndex.hasExportedFunctions(M); + +#ifndef NDEBUG + // First collect those in the llvm.used set. + collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false); + // Next collect those in the llvm.compiler.used set. 
+ collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ true); +#endif } bool run(); diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap index a86bc7e7fcb..29e6d66b27f 100644 --- a/include/llvm/module.modulemap +++ b/include/llvm/module.modulemap @@ -151,6 +151,7 @@ module LLVM_intrinsic_gen { module IR_NoFolder { header "IR/NoFolder.h" export * } module IR_Module { header "IR/Module.h" export * } module IR_ModuleSummaryIndex { header "IR/ModuleSummaryIndex.h" export * } + module IR_ModuleSummaryIndexYAML { header "IR/ModuleSummaryIndexYAML.h" export * } module IR_Function { header "IR/Function.h" export * } module IR_InstrTypes { header "IR/InstrTypes.h" export * } module IR_Instructions { header "IR/Instructions.h" export * } diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 1d2ffc1abe1..6387bb36166 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -80,10 +80,15 @@ static CalleeInfo::HotnessType getHotness(uint64_t ProfileCount, return CalleeInfo::HotnessType::None; } -static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, - const Function &F, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, - bool HasLocalsInUsed) { +static bool isNonRenamableLocal(const GlobalValue &GV) { + return GV.hasSection() && GV.hasLocalLinkage(); +} + +static void +computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, + const Function &F, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, bool HasLocalsInUsed, + DenseSet &CantBePromoted) { // Summary not currently supported for anonymous functions, they should // have been named. assert(F.hasName()); @@ -178,37 +183,64 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, } } - GlobalValueSummary::GVFlags Flags(F); + bool NonRenamableLocal = isNonRenamableLocal(F); + bool NotEligibleForImport = + NonRenamableLocal || HasInlineAsmMaybeReferencingInternal || + // Inliner doesn't handle variadic functions. + // FIXME: refactor this to use the same code that inliner is using. 
+ F.isVarArg(); + GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, + /* LiveRoot = */ false); auto FuncSummary = llvm::make_unique( Flags, NumInsts, RefEdges.takeVector(), CallGraphEdges.takeVector(), TypeTests.takeVector()); - if (HasInlineAsmMaybeReferencingInternal) - FuncSummary->setHasInlineAsmMaybeReferencingInternal(); + if (NonRenamableLocal) + CantBePromoted.insert(F.getGUID()); Index.addGlobalValueSummary(F.getName(), std::move(FuncSummary)); } -static void computeVariableSummary(ModuleSummaryIndex &Index, - const GlobalVariable &V) { +static void +computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, + DenseSet &CantBePromoted) { SetVector RefEdges; SmallPtrSet Visited; findRefEdges(&V, RefEdges, Visited); - GlobalValueSummary::GVFlags Flags(V); + bool NonRenamableLocal = isNonRenamableLocal(V); + GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, + /* LiveRoot = */ false); auto GVarSummary = llvm::make_unique(Flags, RefEdges.takeVector()); + if (NonRenamableLocal) + CantBePromoted.insert(V.getGUID()); Index.addGlobalValueSummary(V.getName(), std::move(GVarSummary)); } -static void computeAliasSummary(ModuleSummaryIndex &Index, - const GlobalAlias &A) { - GlobalValueSummary::GVFlags Flags(A); +static void +computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, + DenseSet &CantBePromoted) { + bool NonRenamableLocal = isNonRenamableLocal(A); + GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, + /* LiveRoot = */ false); auto AS = llvm::make_unique(Flags, ArrayRef{}); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); assert(AliaseeSummary && "Alias expects aliasee summary to be parsed"); AS->setAliasee(AliaseeSummary); + if (NonRenamableLocal) + CantBePromoted.insert(A.getGUID()); Index.addGlobalValueSummary(A.getName(), std::move(AS)); } +// Set LiveRoot flag on entries matching the given value name. +static void setLiveRoot(ModuleSummaryIndex &Index, StringRef Name) { + auto SummaryList = + Index.findGlobalValueSummaryList(GlobalValue::getGUID(Name)); + if (SummaryList == Index.end()) + return; + for (auto &Summary : SummaryList->second) + Summary->setLiveRoot(); +} + ModuleSummaryIndex llvm::buildModuleSummaryIndex( const Module &M, std::function GetBFICallback, @@ -226,9 +258,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false); // Next collect those in the llvm.compiler.used set. collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ true); + DenseSet CantBePromoted; for (auto *V : Used) { - if (V->hasLocalLinkage()) + if (V->hasLocalLinkage()) { LocalsUsed.insert(V); + CantBePromoted.insert(V->getGUID()); + } } // Compute summaries for all functions defined in module, and save in the @@ -248,7 +283,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( BFI = BFIPtr.get(); } - computeFunctionSummary(Index, M, F, BFI, PSI, !LocalsUsed.empty()); + computeFunctionSummary(Index, M, F, BFI, PSI, !LocalsUsed.empty(), + CantBePromoted); } // Compute summaries for all variables defined in module, and save in the @@ -256,20 +292,29 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( for (const GlobalVariable &G : M.globals()) { if (G.isDeclaration()) continue; - computeVariableSummary(Index, G); + computeVariableSummary(Index, G, CantBePromoted); } // Compute summaries for all aliases defined in module, and save in the // index. 
for (const GlobalAlias &A : M.aliases()) - computeAliasSummary(Index, A); + computeAliasSummary(Index, A, CantBePromoted); for (auto *V : LocalsUsed) { auto *Summary = Index.getGlobalValueSummary(*V); assert(Summary && "Missing summary for global value"); - Summary->setNoRename(); + Summary->setNotEligibleToImport(); } + // The linker doesn't know about these LLVM produced values, so we need + // to flag them as live in the index to ensure index-based dead value + // analysis treats them as live roots of the analysis. + setLiveRoot(Index, "llvm.used"); + setLiveRoot(Index, "llvm.compiler.used"); + setLiveRoot(Index, "llvm.global_ctors"); + setLiveRoot(Index, "llvm.global_dtors"); + setLiveRoot(Index, "llvm.global.annotations"); + if (!M.getModuleInlineAsm().empty()) { // Collect the local values defined by module level asm, and set up // summaries for these symbols so that they can be marked as NoRename, @@ -282,7 +327,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( // referenced from there. ModuleSymbolTable::CollectAsmSymbols( Triple(M.getTargetTriple()), M.getModuleInlineAsm(), - [&M, &Index](StringRef Name, object::BasicSymbolRef::Flags Flags) { + [&M, &Index, &CantBePromoted](StringRef Name, + object::BasicSymbolRef::Flags Flags) { // Symbols not marked as Weak or Global are local definitions. if (Flags & (object::BasicSymbolRef::SF_Weak | object::BasicSymbolRef::SF_Global)) @@ -291,11 +337,10 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( if (!GV) return; assert(GV->isDeclaration() && "Def in module asm already has definition"); - GlobalValueSummary::GVFlags GVFlags( - GlobalValue::InternalLinkage, - /* NoRename */ true, - /* HasInlineAsmMaybeReferencingInternal */ false, - /* IsNotViableToInline */ true); + GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, + /* NotEligibleToImport */ true, + /* LiveRoot */ true); + CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. if (isa(GV)) { std::unique_ptr Summary = @@ -303,18 +348,41 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( GVFlags, 0, ArrayRef{}, ArrayRef{}, ArrayRef{}); - Summary->setNoRename(); Index.addGlobalValueSummary(Name, std::move(Summary)); } else { std::unique_ptr Summary = llvm::make_unique(GVFlags, ArrayRef{}); - Summary->setNoRename(); Index.addGlobalValueSummary(Name, std::move(Summary)); } }); } + for (auto &GlobalList : Index) { + assert(GlobalList.second.size() == 1 && + "Expected module's index to have one summary per GUID"); + auto &Summary = GlobalList.second[0]; + bool AllRefsCanBeExternallyReferenced = + llvm::all_of(Summary->refs(), [&](const ValueInfo &VI) { + return !CantBePromoted.count(VI.getValue()->getGUID()); + }); + if (!AllRefsCanBeExternallyReferenced) { + Summary->setNotEligibleToImport(); + continue; + } + + if (auto *FuncSummary = dyn_cast(Summary.get())) { + bool AllCallsCanBeExternallyReferenced = llvm::all_of( + FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) { + auto GUID = Edge.first.isGUID() ? 
Edge.first.getGUID() + : Edge.first.getValue()->getGUID(); + return !CantBePromoted.count(GUID); + }); + if (!AllCallsCanBeExternallyReferenced) + Summary->setNotEligibleToImport(); + } + } + return Index; } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 2a15b9b264e..cd8c24630df 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -389,8 +389,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { } int TargetTransformInfo::getAddressComputationCost(Type *Tp, - bool IsComplex) const { - int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex); + ScalarEvolution *SE, + const SCEV *Ptr) const { + int Cost = TTIImpl->getAddressComputationCost(Tp, SE, Ptr); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 03aefcf5711..d9e249aad21 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -801,12 +801,12 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, // to getDecodedLinkage() will need to be taken into account here as above. auto Linkage = GlobalValue::LinkageTypes(RawFlags & 0xF); // 4 bits RawFlags = RawFlags >> 4; - bool NoRename = RawFlags & 0x1; - bool IsNotViableToInline = RawFlags & 0x2; - bool HasInlineAsmMaybeReferencingInternal = RawFlags & 0x4; - return GlobalValueSummary::GVFlags(Linkage, NoRename, - HasInlineAsmMaybeReferencingInternal, - IsNotViableToInline); + bool NotEligibleToImport = (RawFlags & 0x1) || Version < 3; + // The LiveRoot flag wasn't introduced until version 3. For dead stripping + // to work correctly on earlier versions, we must conservatively treat all + // values as live. + bool LiveRoot = (RawFlags & 0x2) || Version < 3; + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, LiveRoot); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { @@ -4838,9 +4838,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary( } const uint64_t Version = Record[0]; const bool IsOldProfileFormat = Version == 1; - if (!IsOldProfileFormat && Version != 2) + if (Version < 1 || Version > 3) return error("Invalid summary version " + Twine(Version) + - ", 1 or 2 expected"); + ", 1, 2 or 3 expected"); Record.clear(); // Keep around the last seen summary to be used when we see an optional diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp index 43c9aebd79e..771cf3d927b 100644 --- a/lib/Bitcode/Reader/BitstreamReader.cpp +++ b/lib/Bitcode/Reader/BitstreamReader.cpp @@ -93,20 +93,29 @@ static void skipAbbreviatedField(BitstreamCursor &Cursor, } /// skipRecord - Read the current record and discard it. -void BitstreamCursor::skipRecord(unsigned AbbrevID) { +unsigned BitstreamCursor::skipRecord(unsigned AbbrevID) { // Skip unabbreviated records by reading past their entries. 
if (AbbrevID == bitc::UNABBREV_RECORD) { unsigned Code = ReadVBR(6); - (void)Code; unsigned NumElts = ReadVBR(6); for (unsigned i = 0; i != NumElts; ++i) (void)ReadVBR64(6); - return; + return Code; } const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID); + const BitCodeAbbrevOp &CodeOp = Abbv->getOperandInfo(0); + unsigned Code; + if (CodeOp.isLiteral()) + Code = CodeOp.getLiteralValue(); + else { + if (CodeOp.getEncoding() == BitCodeAbbrevOp::Array || + CodeOp.getEncoding() == BitCodeAbbrevOp::Blob) + report_fatal_error("Abbreviation starts with an Array or a Blob"); + Code = readAbbreviatedField(*this, CodeOp); + } - for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) { + for (unsigned i = 1, e = Abbv->getNumOperandInfos(); i < e; ++i) { const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i); if (Op.isLiteral()) continue; @@ -164,6 +173,7 @@ void BitstreamCursor::skipRecord(unsigned AbbrevID) { // Skip over the blob. JumpToBit(NewEnd); } + return Code; } unsigned BitstreamCursor::readRecord(unsigned AbbrevID, @@ -273,7 +283,7 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID, } void BitstreamCursor::ReadAbbrevRecord() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); unsigned NumOpInfo = ReadVBR(5); for (unsigned i = 0; i != NumOpInfo; ++i) { bool IsLiteral = Read(1); @@ -307,7 +317,7 @@ void BitstreamCursor::ReadAbbrevRecord() { if (Abbv->getNumOperandInfos() == 0) report_fatal_error("Abbrev record with no operands"); - CurAbbrevs.push_back(Abbv); + CurAbbrevs.push_back(std::move(Abbv)); } Optional diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index 5da421a79b7..460d39cc28d 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -14,10 +14,12 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" @@ -86,12 +88,23 @@ using namespace llvm; +#define DEBUG_TYPE "bitcode-reader" + +STATISTIC(NumMDStringLoaded, "Number of MDStrings loaded"); +STATISTIC(NumMDNodeTemporary, "Number of MDNode::Temporary created"); +STATISTIC(NumMDRecordLoaded, "Number of Metadata records loaded"); + /// Flag whether we need to import full type definitions for ThinLTO. /// Currently needed for Darwin and LLDB. static cl::opt ImportFullTypeDefinitions( "import-full-type-definitions", cl::init(false), cl::Hidden, cl::desc("Import full type definitions for ThinLTO.")); +static cl::opt DisableLazyLoading( + "disable-ondemand-mds-loading", cl::init(false), cl::Hidden, + cl::desc("Force disable the lazy-loading on-demand of metadata when " + "loading bitcode for importing.")); + namespace { static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; } @@ -165,6 +178,10 @@ class BitcodeReaderMetadataList { void assignValue(Metadata *MD, unsigned Idx); void tryToResolveCycles(); bool hasFwdRefs() const { return !ForwardReference.empty(); } + int getNextFwdRef() { + assert(hasFwdRefs()); + return *ForwardReference.begin(); + } /// Upgrade a type that had an MDString reference. 
void addTypeRef(MDString &UUID, DICompositeType &CT); @@ -215,6 +232,7 @@ Metadata *BitcodeReaderMetadataList::getMetadataFwdRef(unsigned Idx) { ForwardReference.insert(Idx); // Create and return a placeholder, which will later be RAUW'd. + ++NumMDNodeTemporary; Metadata *MD = MDNode::getTemporary(Context, None).release(); MetadataPtrs[Idx].reset(MD); return MD; @@ -340,8 +358,26 @@ class PlaceholderQueue { std::deque PHs; public: + bool empty() { return PHs.empty(); } DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID); void flush(BitcodeReaderMetadataList &MetadataList); + + /// Return the list of temporaries nodes in the queue, these need to be + /// loaded before we can flush the queue. + void getTemporaries(BitcodeReaderMetadataList &MetadataList, + DenseSet &Temporaries) { + for (auto &PH : PHs) { + auto ID = PH.getID(); + auto *MD = MetadataList.lookup(ID); + if (!MD) { + Temporaries.insert(ID); + continue; + } + auto *N = dyn_cast_or_null(MD); + if (N && N->isTemporary()) + Temporaries.insert(ID); + } + } }; } // end anonymous namespace @@ -375,6 +411,30 @@ class MetadataLoader::MetadataLoaderImpl { Module &TheModule; std::function getTypeByID; + /// Cursor associated with the lazy-loading of Metadata. This is the easy way + /// to keep around the right "context" (Abbrev list) to be able to jump in + /// the middle of the metadata block and load any record. + BitstreamCursor IndexCursor; + + /// Index that keeps track of MDString values. + std::vector MDStringRef; + + /// On-demand loading of a single MDString. Requires the index above to be + /// populated. + MDString *lazyLoadOneMDString(unsigned Idx); + + /// Index that keeps track of where to find a metadata record in the stream. + std::vector GlobalMetadataBitPosIndex; + + /// Populate the index above to enable lazily loading of metadata, and load + /// the named metadata as well as the transitively referenced global + /// Metadata. + Expected lazyLoadModuleMetadataBlock(PlaceholderQueue &Placeholders); + + /// On-demand loading of a single metadata. Requires the index above to be + /// populated. + void lazyLoadOneMetadata(unsigned Idx, PlaceholderQueue &Placeholders); + // Keep mapping of seens pair of old-style CU <-> SP, and update pointers to // point from SP to CU after a block is completly parsed. std::vector> CUSubprograms; @@ -394,13 +454,25 @@ class MetadataLoader::MetadataLoaderImpl { Error parseOneMetadata(SmallVectorImpl &Record, unsigned Code, PlaceholderQueue &Placeholders, StringRef Blob, - bool ModuleLevel, unsigned &NextMetadataNo); + unsigned &NextMetadataNo); Error parseMetadataStrings(ArrayRef Record, StringRef Blob, - unsigned &NextMetadataNo); + std::function CallBack); Error parseGlobalObjectAttachment(GlobalObject &GO, ArrayRef Record); Error parseMetadataKindRecord(SmallVectorImpl &Record); + void resolveForwardRefsAndPlaceholders(PlaceholderQueue &Placeholders); + + /// Upgrade old-style CU <-> SP pointers to point from SP to CU. 
+ void upgradeCUSubprograms() { + for (auto CU_SP : CUSubprograms) + if (auto *SPs = dyn_cast_or_null(CU_SP.second)) + for (auto &Op : SPs->operands()) + if (auto *SP = dyn_cast_or_null(Op)) + SP->replaceOperandWith(7, CU_SP.first); + CUSubprograms.clear(); + } + public: MetadataLoaderImpl(BitstreamCursor &Stream, Module &TheModule, BitcodeReaderValueList &ValueList, @@ -444,20 +516,217 @@ Error error(const Twine &Message) { Message, make_error_code(BitcodeError::CorruptedBitcode)); } +Expected MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock( + PlaceholderQueue &Placeholders) { + IndexCursor = Stream; + SmallVector Record; + // Get the abbrevs, and preload record positions to make them lazy-loadable. + while (true) { + BitstreamEntry Entry = IndexCursor.advanceSkippingSubblocks( + BitstreamCursor::AF_DontPopBlockAtEnd); + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: { + return true; + } + case BitstreamEntry::Record: { + // The interesting case. + ++NumMDRecordLoaded; + uint64_t CurrentPos = IndexCursor.GetCurrentBitNo(); + auto Code = IndexCursor.skipRecord(Entry.ID); + switch (Code) { + case bitc::METADATA_STRINGS: { + // Rewind and parse the strings. + IndexCursor.JumpToBit(CurrentPos); + StringRef Blob; + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record, &Blob); + unsigned NumStrings = Record[0]; + MDStringRef.reserve(NumStrings); + auto IndexNextMDString = [&](StringRef Str) { + MDStringRef.push_back(Str); + }; + if (auto Err = parseMetadataStrings(Record, Blob, IndexNextMDString)) + return std::move(Err); + break; + } + case bitc::METADATA_INDEX_OFFSET: { + // This is the offset to the index, when we see this we skip all the + // records and load only an index to these. + IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record); + if (Record.size() != 2) + return error("Invalid record"); + auto Offset = Record[0] + (Record[1] << 32); + auto BeginPos = IndexCursor.GetCurrentBitNo(); + IndexCursor.JumpToBit(BeginPos + Offset); + Entry = IndexCursor.advanceSkippingSubblocks( + BitstreamCursor::AF_DontPopBlockAtEnd); + assert(Entry.Kind == BitstreamEntry::Record && + "Corrupted bitcode: Expected `Record` when trying to find the " + "Metadata index"); + Record.clear(); + auto Code = IndexCursor.readRecord(Entry.ID, Record); + (void)Code; + assert(Code == bitc::METADATA_INDEX && "Corrupted bitcode: Expected " + "`METADATA_INDEX` when trying " + "to find the Metadata index"); + + // Delta unpack + auto CurrentValue = BeginPos; + GlobalMetadataBitPosIndex.reserve(Record.size()); + for (auto &Elt : Record) { + CurrentValue += Elt; + GlobalMetadataBitPosIndex.push_back(CurrentValue); + } + break; + } + case bitc::METADATA_INDEX: + // We don't expect to get there, the Index is loaded when we encounter + // the offset. + return error("Corrupted Metadata block"); + case bitc::METADATA_NAME: { + // Named metadata need to be materialized now and aren't deferred. + IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + unsigned Code = IndexCursor.readRecord(Entry.ID, Record); + assert(Code == bitc::METADATA_NAME); + + // Read name of the named metadata. 
+ SmallString<8> Name(Record.begin(), Record.end()); + Code = IndexCursor.ReadCode(); + + // Named Metadata comes in two parts, we expect the name to be followed + // by the node + Record.clear(); + unsigned NextBitCode = IndexCursor.readRecord(Code, Record); + assert(NextBitCode == bitc::METADATA_NAMED_NODE); + (void)NextBitCode; + + // Read named metadata elements. + unsigned Size = Record.size(); + NamedMDNode *NMD = TheModule.getOrInsertNamedMetadata(Name); + for (unsigned i = 0; i != Size; ++i) { + // FIXME: We could use a placeholder here, however NamedMDNode are + // taking MDNode as operand and not using the Metadata infrastructure. + // It is acknowledged by 'TODO: Inherit from Metadata' in the + // NamedMDNode class definition. + MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]); + assert(MD && "Invalid record"); + NMD->addOperand(MD); + } + break; + } + case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: { + // FIXME: we need to do this early because we don't materialize global + // value explicitly. + IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record); + if (Record.size() % 2 == 0) + return error("Invalid record"); + unsigned ValueID = Record[0]; + if (ValueID >= ValueList.size()) + return error("Invalid record"); + if (auto *GO = dyn_cast(ValueList[ValueID])) + if (Error Err = parseGlobalObjectAttachment( + *GO, ArrayRef(Record).slice(1))) + return std::move(Err); + break; + } + case bitc::METADATA_KIND: + case bitc::METADATA_STRING_OLD: + case bitc::METADATA_OLD_FN_NODE: + case bitc::METADATA_OLD_NODE: + case bitc::METADATA_VALUE: + case bitc::METADATA_DISTINCT_NODE: + case bitc::METADATA_NODE: + case bitc::METADATA_LOCATION: + case bitc::METADATA_GENERIC_DEBUG: + case bitc::METADATA_SUBRANGE: + case bitc::METADATA_ENUMERATOR: + case bitc::METADATA_BASIC_TYPE: + case bitc::METADATA_DERIVED_TYPE: + case bitc::METADATA_COMPOSITE_TYPE: + case bitc::METADATA_SUBROUTINE_TYPE: + case bitc::METADATA_MODULE: + case bitc::METADATA_FILE: + case bitc::METADATA_COMPILE_UNIT: + case bitc::METADATA_SUBPROGRAM: + case bitc::METADATA_LEXICAL_BLOCK: + case bitc::METADATA_LEXICAL_BLOCK_FILE: + case bitc::METADATA_NAMESPACE: + case bitc::METADATA_MACRO: + case bitc::METADATA_MACRO_FILE: + case bitc::METADATA_TEMPLATE_TYPE: + case bitc::METADATA_TEMPLATE_VALUE: + case bitc::METADATA_GLOBAL_VAR: + case bitc::METADATA_LOCAL_VAR: + case bitc::METADATA_EXPRESSION: + case bitc::METADATA_OBJC_PROPERTY: + case bitc::METADATA_IMPORTED_ENTITY: + case bitc::METADATA_GLOBAL_VAR_EXPR: + // We don't expect to see any of these, if we see one, give up on + // lazy-loading and fallback. + MDStringRef.clear(); + GlobalMetadataBitPosIndex.clear(); + return false; + } + break; + } + } + } +} + /// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing /// module level metadata. Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { if (!ModuleLevel && MetadataList.hasFwdRefs()) return error("Invalid metadata: fwd refs into function blocks"); + // Record the entry position so that we can jump back here and efficiently + // skip the whole block in case we lazy-load. 
+ auto EntryPos = Stream.GetCurrentBitNo(); + if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID)) return error("Invalid record"); - unsigned NextMetadataNo = MetadataList.size(); SmallVector Record; - PlaceholderQueue Placeholders; + // We lazy-load module-level metadata: we build an index for each record, and + // then load individual record as needed, starting with the named metadata. + if (ModuleLevel && IsImporting && MetadataList.empty() && + !DisableLazyLoading) { + auto SuccessOrErr = lazyLoadModuleMetadataBlock(Placeholders); + if (!SuccessOrErr) + return SuccessOrErr.takeError(); + if (SuccessOrErr.get()) { + // An index was successfully created and we will be able to load metadata + // on-demand. + MetadataList.resize(MDStringRef.size() + + GlobalMetadataBitPosIndex.size()); + + // Reading the named metadata created forward references and/or + // placeholders, that we flush here. + resolveForwardRefsAndPlaceholders(Placeholders); + upgradeCUSubprograms(); + // Return at the beginning of the block, since it is easy to skip it + // entirely from there. + Stream.ReadBlockEnd(); // Pop the abbrev block context. + Stream.JumpToBit(EntryPos); + if (Stream.SkipBlock()) + return error("Invalid record"); + return Error::success(); + } + // Couldn't load an index, fallback to loading all the block "old-style". + } + + unsigned NextMetadataNo = MetadataList.size(); + // Read all the records. while (true) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); @@ -467,16 +736,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: - // Upgrade old-style CU <-> SP pointers to point from SP to CU. - for (auto CU_SP : CUSubprograms) - if (auto *SPs = dyn_cast_or_null(CU_SP.second)) - for (auto &Op : SPs->operands()) - if (auto *SP = dyn_cast_or_null(Op)) - SP->replaceOperandWith(7, CU_SP.first); - CUSubprograms.clear(); - - MetadataList.tryToResolveCycles(); - Placeholders.flush(MetadataList); + resolveForwardRefsAndPlaceholders(Placeholders); + upgradeCUSubprograms(); return Error::success(); case BitstreamEntry::Record: // The interesting case. @@ -486,20 +747,86 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { // Read a record. Record.clear(); StringRef Blob; + ++NumMDRecordLoaded; unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob); - if (Error Err = parseOneMetadata(Record, Code, Placeholders, Blob, - ModuleLevel, NextMetadataNo)) + if (Error Err = + parseOneMetadata(Record, Code, Placeholders, Blob, NextMetadataNo)) return Err; } } +MDString *MetadataLoader::MetadataLoaderImpl::lazyLoadOneMDString(unsigned ID) { + ++NumMDStringLoaded; + if (Metadata *MD = MetadataList.lookup(ID)) + return cast(MD); + auto MDS = MDString::get(Context, MDStringRef[ID]); + MetadataList.assignValue(MDS, ID); + return MDS; +} + +void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata( + unsigned ID, PlaceholderQueue &Placeholders) { + assert(ID < (MDStringRef.size()) + GlobalMetadataBitPosIndex.size()); + assert(ID >= MDStringRef.size() && "Unexpected lazy-loading of MDString"); +#ifndef NDEBUG + // Lookup first if the metadata hasn't already been loaded. 
+ if (auto *MD = MetadataList.lookup(ID)) { + auto *N = dyn_cast_or_null(MD); + assert(N && N->isTemporary() && "Lazy loading an already loaded metadata"); + } +#endif + SmallVector Record; + StringRef Blob; + IndexCursor.JumpToBit(GlobalMetadataBitPosIndex[ID - MDStringRef.size()]); + auto Entry = IndexCursor.advanceSkippingSubblocks(); + ++NumMDRecordLoaded; + unsigned Code = IndexCursor.readRecord(Entry.ID, Record, &Blob); + if (Error Err = parseOneMetadata(Record, Code, Placeholders, Blob, ID)) + report_fatal_error("Can't lazyload MD"); +} + +/// Ensure that all forward-references and placeholders are resolved. +/// Iteratively lazy-loading metadata on-demand if needed. +void MetadataLoader::MetadataLoaderImpl::resolveForwardRefsAndPlaceholders( + PlaceholderQueue &Placeholders) { + DenseSet Temporaries; + while (1) { + // Populate Temporaries with the placeholders that haven't been loaded yet. + Placeholders.getTemporaries(MetadataList, Temporaries); + + // If we don't have any temporary, or FwdReference, we're done! + if (Temporaries.empty() && !MetadataList.hasFwdRefs()) + break; + + // First, load all the temporaries. This can add new placeholders or + // forward references. + for (auto ID : Temporaries) + lazyLoadOneMetadata(ID, Placeholders); + Temporaries.clear(); + + // Second, load the forward-references. This can also add new placeholders + // or forward references. + while (MetadataList.hasFwdRefs()) + lazyLoadOneMetadata(MetadataList.getNextFwdRef(), Placeholders); + } + // At this point we don't have any forward reference remaining, or temporary + // that haven't been loaded. We can safely drop RAUW support and mark cycles + // as resolved. + MetadataList.tryToResolveCycles(); + + // Finally, everything is in place, we can replace the placeholders operands + // with the final node they refer to. + Placeholders.flush(MetadataList); +} + Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( SmallVectorImpl &Record, unsigned Code, - PlaceholderQueue &Placeholders, StringRef Blob, bool ModuleLevel, - unsigned &NextMetadataNo) { + PlaceholderQueue &Placeholders, StringRef Blob, unsigned &NextMetadataNo) { bool IsDistinct = false; auto getMD = [&](unsigned ID) -> Metadata * { + if (ID < MDStringRef.size()) + return lazyLoadOneMDString(ID); if (!IsDistinct) return MetadataList.getMetadataFwdRef(ID); if (auto *MD = MetadataList.getMetadataIfResolved(ID)) @@ -519,7 +846,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( auto getMDString = [&](unsigned ID) -> MDString * { // This requires that the ID is not really a forward reference. In // particular, the MDString must already have been resolved. - return cast_or_null(getMDOrNull(ID)); + auto MDS = getMDOrNull(ID); + return cast_or_null(MDS); }; // Support for old type refs. @@ -539,6 +867,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( Record.clear(); Code = Stream.ReadCode(); + ++NumMDRecordLoaded; unsigned NextBitCode = Stream.readRecord(Code, Record); if (NextBitCode != bitc::METADATA_NAMED_NODE) return error("METADATA_NAME not followed by METADATA_NAMED_NODE"); @@ -1137,15 +1466,20 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( // Test for upgrading !llvm.loop. 
HasSeenOldLoopTags |= mayBeOldLoopAttachmentTag(String); - + ++NumMDStringLoaded; Metadata *MD = MDString::get(Context, String); MetadataList.assignValue(MD, NextMetadataNo++); break; } - case bitc::METADATA_STRINGS: - if (Error Err = parseMetadataStrings(Record, Blob, NextMetadataNo)) + case bitc::METADATA_STRINGS: { + auto CreateNextMDString = [&](StringRef Str) { + ++NumMDStringLoaded; + MetadataList.assignValue(MDString::get(Context, Str), NextMetadataNo++); + }; + if (Error Err = parseMetadataStrings(Record, Blob, CreateNextMDString)) return Err; break; + } case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: { if (Record.size() % 2 == 0) return error("Invalid record"); @@ -1166,12 +1500,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } } -#undef GET_OR_DISTINCT return Error::success(); +#undef GET_OR_DISTINCT } Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings( - ArrayRef Record, StringRef Blob, unsigned &NextMetadataNo) { + ArrayRef Record, StringRef Blob, + std::function CallBack) { // All the MDStrings in the block are emitted together in a single // record. The strings are concatenated and stored in a blob along with // their sizes. @@ -1197,8 +1532,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings( if (Strings.size() < Size) return error("Invalid record: metadata strings truncated chars"); - MetadataList.assignValue(MDString::get(Context, Strings.slice(0, Size)), - NextMetadataNo++); + CallBack(Strings.slice(0, Size)); Strings = Strings.drop_front(Size); } while (--NumStrings); @@ -1228,6 +1562,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( SmallVector Record; + PlaceholderQueue Placeholders; + while (true) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); @@ -1236,6 +1572,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: + resolveForwardRefsAndPlaceholders(Placeholders); return Error::success(); case BitstreamEntry::Record: // The interesting case. @@ -1244,6 +1581,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( // Read a metadata attachment record. Record.clear(); + ++NumMDRecordLoaded; switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: ignore. break; @@ -1268,7 +1606,14 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( if (I->second == LLVMContext::MD_tbaa && StripTBAA) continue; - Metadata *Node = MetadataList.getMetadataFwdRef(Record[i + 1]); + auto Idx = Record[i + 1]; + if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) && + !MetadataList.lookup(Idx)) + // Load the attachment if it is in the lazy-loadable range and hasn't + // been loaded yet. + lazyLoadOneMetadata(Idx, Placeholders); + + Metadata *Node = MetadataList.getMetadataFwdRef(Idx); if (isa(Node)) // Drop the attachment. This used to be legal, but there's no // upgrade path. @@ -1331,6 +1676,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataKinds() { // Read a record. Record.clear(); + ++NumMDRecordLoaded; unsigned Code = Stream.readRecord(Entry.ID, Record); switch (Code) { default: // Default behavior: ignore. 
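The METADATA_INDEX record consumed by lazyLoadModuleMetadataBlock above stores each record's bit position as a delta from the previous one, with the first delta relative to the position just after the METADATA_INDEX_OFFSET record. A small standalone sketch of that unpacking follows; the function name is illustrative and not part of the loader:

  #include "llvm/ADT/ArrayRef.h"
  #include <cstdint>
  #include <vector>

  // Rebuild absolute bit positions from the delta-encoded METADATA_INDEX
  // payload, mirroring the cumulative sum in lazyLoadModuleMetadataBlock.
  static std::vector<uint64_t> unpackIndex(uint64_t BeginPos,
                                           llvm::ArrayRef<uint64_t> Deltas) {
    std::vector<uint64_t> BitPos;
    BitPos.reserve(Deltas.size());
    uint64_t Current = BeginPos;
    for (uint64_t Delta : Deltas) {
      Current += Delta;
      BitPos.push_back(Current);
    }
    return BitPos;
  }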
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index c10ba2399e7..ebb2022551f 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -784,53 +784,53 @@ void ModuleBitcodeWriter::writeTypeTable() { uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies(); // Abbrev for TYPE_CODE_POINTER. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0 - unsigned PtrAbbrev = Stream.EmitAbbrev(Abbv); + unsigned PtrAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_FUNCTION. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned FunctionAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FunctionAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_ANON. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_ANON)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned StructAnonAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructAnonAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_NAME. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - unsigned StructNameAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructNameAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_NAMED. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAMED)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned StructNamedAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructNamedAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_ARRAY. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned ArrayAbbrev = Stream.EmitAbbrev(Abbv); + unsigned ArrayAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit an entry count so the reader can reserve space. TypeVals.push_back(TypeList.size()); @@ -971,9 +971,8 @@ static unsigned getEncodedLinkage(const GlobalValue &GV) { static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { uint64_t RawFlags = 0; - RawFlags |= Flags.NoRename; // bool - RawFlags |= (Flags.IsNotViableToInline << 1); - RawFlags |= (Flags.HasInlineAsmMaybeReferencingInternal << 2); + RawFlags |= Flags.NotEligibleToImport; // bool + RawFlags |= (Flags.LiveRoot << 1); // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. 
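Taken together with getDecodedGVSummaryFlags in the reader earlier in this patch, the summary flag word carries the linkage in its low four bits, with NotEligibleToImport and LiveRoot in the two bits above them. The round trip below is only a sketch of that layout: the writer's packing of the linkage bits is outside the hunk shown, so the encode side is an assumption, and the reader additionally forces both flags on for summary versions older than 3.

  #include <cstdint>

  // Sketch of the flag layout implied by getDecodedGVSummaryFlags:
  //   bits 0-3: linkage, bit 4: NotEligibleToImport, bit 5: LiveRoot.
  static uint64_t encodeSummaryFlags(unsigned Linkage, bool NotEligibleToImport,
                                     bool LiveRoot) {
    uint64_t Flags = 0;
    Flags |= NotEligibleToImport;           // becomes bit 4 after the shift
    Flags |= uint64_t(LiveRoot) << 1;       // becomes bit 5 after the shift
    return (Flags << 4) | (Linkage & 0xF);  // assumed packing of the linkage
  }

  static void decodeSummaryFlags(uint64_t Raw, unsigned &Linkage,
                                 bool &NotEligibleToImport, bool &LiveRoot) {
    Linkage = Raw & 0xF;  // low 4 bits, as in getDecodedGVSummaryFlags
    Raw >>= 4;
    NotEligibleToImport = Raw & 0x1;
    LiveRoot = (Raw & 0x2) != 0;
  }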
@@ -1059,13 +1058,13 @@ void BitcodeWriterBase::writeValueSymbolTableForwardDecl() { // which is written after the function blocks so that it can include // the offset of each function. The placeholder offset will be // updated when the real VST is written. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_VSTOFFSET)); // Blocks are 32-bit aligned, so we can use a 32-bit word offset to // hold the real VST offset. Must use fixed instead of VBR as we don't // know how many VBR chunks to reserve ahead of time. Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned VSTOffsetAbbrev = Stream.EmitAbbrev(Abbv); + unsigned VSTOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit the placeholder uint64_t Vals[] = {bitc::MODULE_CODE_VSTOFFSET, 0}; @@ -1155,7 +1154,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { unsigned SimpleGVarAbbrev = 0; if (!M.global_empty()) { // Add an abbrev for common globals with no visibility or thread localness. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_GLOBALVAR)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(MaxGlobalType+1))); @@ -1177,7 +1176,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(SectionMap.size()+1))); // Don't bother emitting vis + thread local. - SimpleGVarAbbrev = Stream.EmitAbbrev(Abbv); + SimpleGVarAbbrev = Stream.EmitAbbrev(std::move(Abbv)); } // Emit the global variable information. @@ -1285,11 +1284,11 @@ void ModuleBitcodeWriter::writeModuleInfo() { AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7); // MODULE_CODE_SOURCE_FILENAME: [namechar x N] - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_SOURCE_FILENAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(AbbrevOpToUse); - unsigned FilenameAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FilenameAbbrev = Stream.EmitAbbrev(std::move(Abbv)); for (const auto P : M.getSourceFileName()) Vals.push_back((unsigned char)P); @@ -1360,14 +1359,14 @@ void ModuleBitcodeWriter::writeMDTuple(const MDTuple *N, unsigned ModuleBitcodeWriter::createDILocationAbbrev() { // Assume the column is usually under 128, and always output the inlined-at // location (it's never more expensive than building an array size 1). - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_LOCATION)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeDILocation(const DILocation *N, @@ -1389,7 +1388,7 @@ void ModuleBitcodeWriter::writeDILocation(const DILocation *N, unsigned ModuleBitcodeWriter::createGenericDINodeAbbrev() { // Assume the column is usually under 128, and always output the inlined-at // location (it's never more expensive than building an array size 1). 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_GENERIC_DEBUG)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); @@ -1397,7 +1396,7 @@ unsigned ModuleBitcodeWriter::createGenericDINodeAbbrev() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeGenericDINode(const GenericDINode *N, @@ -1790,11 +1789,11 @@ void ModuleBitcodeWriter::writeDIImportedEntity( } unsigned ModuleBitcodeWriter::createNamedMetadataAbbrev() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_NAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeNamedMetadata( @@ -1819,12 +1818,12 @@ void ModuleBitcodeWriter::writeNamedMetadata( } unsigned ModuleBitcodeWriter::createMetadataStringsAbbrev() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRINGS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # of strings Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // offset to chars Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } /// Write out a record for MDString. @@ -1918,17 +1917,17 @@ void ModuleBitcodeWriter::writeModuleMetadata() { MDAbbrevs[MetadataAbbrev::GenericDINodeAbbrevID] = createGenericDINodeAbbrev(); - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_INDEX_OFFSET)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned OffsetAbbrev = Stream.EmitAbbrev(Abbv); + unsigned OffsetAbbrev = Stream.EmitAbbrev(std::move(Abbv)); - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_INDEX)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - unsigned IndexAbbrev = Stream.EmitAbbrev(Abbv); + unsigned IndexAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit MDStrings together upfront. writeMetadataStrings(VE.getMDStrings(), Record); @@ -2125,30 +2124,30 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, // If this is a constant pool for the module, emit module-specific abbrevs. if (isGlobal) { // Abbrev for CST_CODE_AGGREGATE. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_AGGREGATE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal+1))); - AggregateAbbrev = Stream.EmitAbbrev(Abbv); + AggregateAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_STRING. 
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_STRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - String8Abbrev = Stream.EmitAbbrev(Abbv); + String8Abbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_CSTRING. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - CString7Abbrev = Stream.EmitAbbrev(Abbv); + CString7Abbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_CSTRING. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - CString6Abbrev = Stream.EmitAbbrev(Abbv); + CString6Abbrev = Stream.EmitAbbrev(std::move(Abbv)); } SmallVector Record; @@ -2858,39 +2857,39 @@ void ModuleBitcodeWriter::writeValueSymbolTable( unsigned GUIDEntryAbbrev; if (IsModuleLevel && hasVSTOffsetPlaceholder()) { // 8-bit fixed-width VST_CODE_FNENTRY function strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry8BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // 7-bit fixed width VST_CODE_FNENTRY function strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry7BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // 6-bit char6 VST_CODE_FNENTRY function strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry6BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // FIXME: Change the name of this record as it is now used by // the per-module index as well. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid - GUIDEntryAbbrev = Stream.EmitAbbrev(Abbv); + GUIDEntryAbbrev = Stream.EmitAbbrev(std::move(Abbv)); } // FIXME: Set up the abbrev, we know how many values there are! 
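// The hunks in this file all apply the same mechanical change, so the new
// abbreviation-definition pattern is worth spelling out once. This is a
// hypothetical standalone fragment, not a line from the patch: the element type
// of make_shared is inferred to be BitCodeAbbrev from the new-expressions being
// replaced (template arguments are not visible in this copy of the diff), and it
// assumes EmitAbbrev now takes ownership of a std::shared_ptr<BitCodeAbbrev>
// rather than a raw pointer, which is why every call site hands the abbreviation
// off with std::move.
//
//   auto Abbv = std::make_shared<BitCodeAbbrev>();
//   Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
//   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // value id
//   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
//   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // 8-bit chars
//   unsigned EntryAbbrev = Stream.EmitAbbrev(std::move(Abbv));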
@@ -2984,11 +2983,11 @@ void IndexBitcodeWriter::writeCombinedValueSymbolTable() { Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4); - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid - unsigned EntryAbbrev = Stream.EmitAbbrev(Abbv); + unsigned EntryAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector NameVals; for (const auto &GVI : valueIds()) { @@ -3121,7 +3120,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Stream.EnterBlockInfoBlock(); { // 8-bit fixed-width VST_CODE_ENTRY/VST_CODE_BBENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3132,7 +3131,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // 7-bit fixed width VST_CODE_ENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3142,7 +3141,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // 6-bit char6 VST_CODE_ENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3152,7 +3151,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // 6-bit char6 VST_CODE_BBENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_BBENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3165,7 +3164,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // SETTYPE abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, VE.computeBitsRequiredForTypeIndicies())); @@ -3175,7 +3174,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // INTEGER abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_INTEGER)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != @@ -3184,7 +3183,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // CE_CAST abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid @@ -3196,7 +3195,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // NULL abbrev for CONSTANTS_BLOCK. 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_NULL)); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != CONSTANTS_NULL_Abbrev) @@ -3206,7 +3205,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { // FIXME: This should only use space for first class types! { // INST_LOAD abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3218,7 +3217,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_BINOP abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS @@ -3228,7 +3227,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_BINOP_FLAGS abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS @@ -3239,7 +3238,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_CAST abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3251,14 +3250,14 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // INST_RET abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET)); if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_RET_VOID_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_RET abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ValID if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != @@ -3266,14 +3265,14 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_UNREACHABLE abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNREACHABLE)); if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_UNREACHABLE_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); } { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_GEP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3296,38 +3295,38 @@ void IndexBitcodeWriter::writeModStrings() { // TODO: See which abbrev sizes we actually need to emit // 8-bit fixed-width MST_ENTRY strings. 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - unsigned Abbrev8Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev8Bit = Stream.EmitAbbrev(std::move(Abbv)); // 7-bit fixed width MST_ENTRY strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - unsigned Abbrev7Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev7Bit = Stream.EmitAbbrev(std::move(Abbv)); // 6-bit char6 MST_ENTRY strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - unsigned Abbrev6Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev6Bit = Stream.EmitAbbrev(std::move(Abbv)); // Module Hash, 160 bits SHA1. Optionally, emitted after each MST_CODE_ENTRY. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_HASH)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned AbbrevHash = Stream.EmitAbbrev(Abbv); + unsigned AbbrevHash = Stream.EmitAbbrev(std::move(Abbv)); SmallVector Vals; for (const auto &MPSE : Index.modulePaths()) { @@ -3435,7 +3434,7 @@ void ModuleBitcodeWriter::writeModuleLevelReferences( // Current version for the summary. // This is bumped whenever we introduce changes in the way some record are // interpreted, like flags for instance. -static const uint64_t INDEX_VERSION = 2; +static const uint64_t INDEX_VERSION = 3; /// Emit the per-module summary section alongside the rest of /// the module's bitcode. @@ -3450,7 +3449,7 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { } // Abbrev for FS_PERMODULE. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags @@ -3459,10 +3458,10 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_PERMODULE_PROFILE. 
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags @@ -3471,24 +3470,24 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_PERMODULE_GLOBALVAR_INIT_REFS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSModRefsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSModRefsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_ALIAS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_ALIAS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - unsigned FSAliasAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector NameVals; // Iterate over the list of functions instead of the Index to @@ -3542,7 +3541,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Stream.EmitRecord(bitc::FS_VERSION, ArrayRef{INDEX_VERSION}); // Abbrev for FS_COMBINED. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid @@ -3552,10 +3551,10 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_PROFILE. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_PROFILE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid @@ -3565,26 +3564,26 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_GLOBALVAR_INIT_REFS. 
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_GLOBALVAR_INIT_REFS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSModRefsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSModRefsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_ALIAS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALIAS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - unsigned FSAliasAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // The aliases are emitted as a post-pass, and will point to the value // id of the aliasee. Save them in a vector for post-processing. @@ -3702,19 +3701,19 @@ void writeIdentificationBlock(BitstreamWriter &Stream) { Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5); // Write the "user readable" string identifying the bitcode producer - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_STRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - auto StringAbbrev = Stream.EmitAbbrev(Abbv); + auto StringAbbrev = Stream.EmitAbbrev(std::move(Abbv)); writeStringRecord(Stream, bitc::IDENTIFICATION_CODE_STRING, "LLVM" LLVM_VERSION_STRING, StringAbbrev); // Write the epoch version - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_EPOCH)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - auto EpochAbbrev = Stream.EmitAbbrev(Abbv); + auto EpochAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector Vals = {bitc::BITCODE_CURRENT_EPOCH}; Stream.EmitRecord(bitc::IDENTIFICATION_CODE_EPOCH, Vals, EpochAbbrev); Stream.ExitBlock(); diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index 0c79def8793..61149d9229b 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -53,7 +53,8 @@ void ARMException::beginFunction(const MachineFunction *MF) { if (MoveType == AsmPrinter::CFI_M_Debug) { if (!hasEmittedCFISections) { - Asm->OutStreamer->EmitCFISections(false, true); + if (Asm->needsOnlyDebugCFIMoves()) + Asm->OutStreamer->EmitCFISections(false, true); hasEmittedCFISections = true; } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 5f15ac1d503..9f6caa95a9e 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -108,7 +108,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL, AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr Streamer) : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()), OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)), - LastMI(nullptr), LastFn(0), Counter(~0U) { + isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) { DD = nullptr; MMI = nullptr; LI = nullptr; @@ -264,6 +264,28 @@ bool 
AsmPrinter::doInitialization(Module &M) { } } + switch (MAI->getExceptionHandlingType()) { + case ExceptionHandling::SjLj: + case ExceptionHandling::DwarfCFI: + case ExceptionHandling::ARM: + isCFIMoveForDebugging = true; + if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI) + break; + for (auto &F: M.getFunctionList()) { + // If the module contains any function with unwind data, + // .eh_frame has to be emitted. + // Ignore functions that won't get emitted. + if (!F.isDeclarationForLinker() && F.needsUnwindTableEntry()) { + isCFIMoveForDebugging = false; + break; + } + } + break; + default: + isCFIMoveForDebugging = false; + break; + } + EHStreamer *ES = nullptr; switch (MAI->getExceptionHandlingType()) { case ExceptionHandling::None: diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 20075e41977..57864e4e4d4 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -100,6 +100,8 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, } SourceMgr SrcMgr; + SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); + SrcMgrDiagInfo DiagInfo; // If the current LLVMContext has an inline asm handler, set it in SourceMgr. diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index ef30e279aed..e08306b001f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -137,7 +137,7 @@ void DwarfCFIException::beginFragment(const MachineBasicBlock *MBB, return; if (!hasEmittedCFISections) { - if (Asm->needsCFIMoves() == AsmPrinter::CFI_M_Debug) + if (Asm->needsOnlyDebugCFIMoves()) Asm->OutStreamer->EmitCFISections(false, true); hasEmittedCFISections = true; } diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index cf35afbc6e5..89a042ffc47 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -125,8 +125,11 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) { MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) { MachineBasicBlock *&MBB = BBToMBB[&BB]; if (!MBB) { - MBB = MF->CreateMachineBasicBlock(); + MBB = MF->CreateMachineBasicBlock(&BB); MF->push_back(MBB); + + if (BB.hasAddressTaken()) + MBB->setHasAddressTaken(); } return *MBB; } @@ -195,6 +198,45 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { return true; } +bool IRTranslator::translateSwitch(const User &U, + MachineIRBuilder &MIRBuilder) { + // For now, just translate as a chain of conditional branches. + // FIXME: could we share most of the logic/code in + // SelectionDAGBuilder::visitSwitch between SelectionDAG and GlobalISel? + // At first sight, it seems most of the logic in there is independent of + // SelectionDAG-specifics and a lot of work went in to optimize switch + // lowering in there. 
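// To make the "chain of conditional branches" shape concrete before the
// MachineIRBuilder code below, here is a plain C++ analogue of the control flow
// it emits: one equality test per case, each falling through to the next test,
// with the default destination reached only after every case has been rejected.
// The function and parameter names are invented for illustration; this is not
// code from the pass.
static int lowerSwitchAsCompareChain(int Cond, const int *CaseVals,
                                     const int *CaseResults, unsigned NumCases,
                                     int DefaultResult) {
  for (unsigned I = 0; I != NumCases; ++I)   // G_ICMP eq + conditional branch
    if (Cond == CaseVals[I])
      return CaseResults[I];                 // branch to the case successor
  return DefaultResult;                      // unconditional branch to default
}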
+ + const SwitchInst &SwInst = cast(U); + const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition()); + + LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL); + for (auto &CaseIt : SwInst.cases()) { + const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue()); + const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1); + MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor()); + + MIRBuilder.buildBrCond(Tst, TrueBB); + CurBB.addSuccessor(&TrueBB); + + MachineBasicBlock *FalseBB = + MF->CreateMachineBasicBlock(SwInst.getParent()); + MF->push_back(FalseBB); + MIRBuilder.buildBr(*FalseBB); + CurBB.addSuccessor(FalseBB); + + MIRBuilder.setMBB(*FalseBB); + } + // handle default case + MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest()); + MIRBuilder.buildBr(DefaultBB); + MIRBuilder.getMBB().addSuccessor(&DefaultBB); + + return true; +} + bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast(U); diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index a6c93bc0f3d..7d405dd92ac 100644 --- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -55,11 +55,10 @@ const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks) : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { - DEBUG(for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { +#ifndef NDEBUG + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); - assert(!RegBanks[Idx]->isValid() && - "RegisterBank should be invalid before initialization"); - }); +#endif // NDEBUG } RegisterBankInfo::~RegisterBankInfo() { @@ -70,13 +69,15 @@ RegisterBankInfo::~RegisterBankInfo() { } bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { - DEBUG(for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { +#ifndef NDEBUG + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { const RegisterBank &RegBank = getRegBank(Idx); assert(Idx == RegBank.getID() && "ID does not match the index in the array"); dbgs() << "Verify " << RegBank << '\n'; assert(RegBank.verify(TRI) && "RegBank is invalid"); - }); + } +#endif // NDEBUG return true; } diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 0cac7b71e24..b9f3d86eabd 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -1495,16 +1495,18 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { if (TII->reverseBranchCondition(Cond)) llvm_unreachable("Unable to reverse branch condition!"); - // Initialize liveins to the first BB. These are potentiall redefined by - // predicated instructions. Redefs.init(*TRI); - Redefs.addLiveIns(CvtMBB); - Redefs.addLiveIns(NextMBB); - - // Compute a set of registers which must not be killed by instructions in - // BB1: This is everything live-in to BB2. DontKill.init(*TRI); - DontKill.addLiveIns(NextMBB); + + if (MRI->tracksLiveness()) { + // Initialize liveins to the first BB. These are potentiall redefined by + // predicated instructions. 
+ Redefs.addLiveIns(CvtMBB); + Redefs.addLiveIns(NextMBB); + // Compute a set of registers which must not be killed by instructions in + // BB1: This is everything live-in to BB2. + DontKill.addLiveIns(NextMBB); + } if (CvtMBB.pred_size() > 1) { BBI.NonPredSize -= TII->removeBranch(*BBI.BB); @@ -1602,8 +1604,10 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { // Initialize liveins to the first BB. These are potentially redefined by // predicated instructions. Redefs.init(*TRI); - Redefs.addLiveIns(CvtMBB); - Redefs.addLiveIns(NextMBB); + if (MRI->tracksLiveness()) { + Redefs.addLiveIns(CvtMBB); + Redefs.addLiveIns(NextMBB); + } DontKill.clear(); @@ -1766,8 +1770,10 @@ bool IfConverter::IfConvertDiamondCommon( // instructions. We start with BB1 live-ins so we have the live-out regs // after tracking the BB1 instructions. Redefs.init(*TRI); - Redefs.addLiveIns(MBB1); - Redefs.addLiveIns(MBB2); + if (MRI->tracksLiveness()) { + Redefs.addLiveIns(MBB1); + Redefs.addLiveIns(MBB2); + } // Remove the duplicated instructions at the beginnings of both paths. // Skip dbg_value instructions @@ -1792,12 +1798,14 @@ bool IfConverter::IfConvertDiamondCommon( // This is everything used+live in BB2 after the duplicated instructions. We // can compute this set by simulating liveness backwards from the end of BB2. DontKill.init(*TRI); - for (const MachineInstr &MI : make_range(MBB2.rbegin(), ++DI2.getReverse())) - DontKill.stepBackward(MI); + if (MRI->tracksLiveness()) { + for (const MachineInstr &MI : make_range(MBB2.rbegin(), ++DI2.getReverse())) + DontKill.stepBackward(MI); - for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) { - SmallVector, 4> IgnoredClobbers; - Redefs.stepForward(MI, IgnoredClobbers); + for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) { + SmallVector, 4> Dummy; + Redefs.stepForward(MI, Dummy); + } } BBI.BB->splice(BBI.BB->end(), &MBB1, MBB1.begin(), DI1); MBB2.erase(MBB2.begin(), DI2); diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index eb13d2d3ec0..db87092177c 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -488,16 +488,16 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { } // Print the live in registers. 
- const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); - assert(TRI && "Expected target register info"); - if (!MBB.livein_empty()) { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + if (MRI.tracksLiveness() && !MBB.livein_empty()) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); OS.indent(2) << "liveins: "; bool First = true; for (const auto &LI : MBB.liveins()) { if (!First) OS << ", "; First = false; - printReg(LI.PhysReg, OS, TRI); + printReg(LI.PhysReg, OS, &TRI); if (!LI.LaneMask.all()) OS << ":0x" << PrintLaneMask(LI.LaneMask); } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 549424d257f..3869f976854 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -286,7 +286,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, if (!livein_empty()) { if (Indexes) OS << '\t'; OS << " Live Ins:"; - for (const auto &LI : make_range(livein_begin(), livein_end())) { + for (const auto &LI : LiveIns) { OS << ' ' << PrintReg(LI.PhysReg, TRI); if (!LI.LaneMask.all()) OS << ':' << PrintLaneMask(LI.LaneMask); @@ -1292,3 +1292,10 @@ MachineBasicBlock::getEndClobberMask(const TargetRegisterInfo *TRI) const { void MachineBasicBlock::clearLiveIns() { LiveIns.clear(); } + +MachineBasicBlock::livein_iterator MachineBasicBlock::livein_begin() const { + assert(getParent()->getProperties().hasProperty( + MachineFunctionProperties::Property::TracksLiveness) && + "Liveness information is accurate"); + return LiveIns.begin(); +} diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 426a4666c64..a98139f9e5a 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -566,7 +566,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { FirstTerminator = nullptr; if (!MF->getProperties().hasProperty( - MachineFunctionProperties::Property::NoPHIs)) { + MachineFunctionProperties::Property::NoPHIs) && MRI->tracksLiveness()) { // If this block has allocatable physical registers live-in, check that // it is an entry block or landing pad. for (const auto &LI : MBB->liveins()) { @@ -741,14 +741,16 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } regsLive.clear(); - for (const auto &LI : MBB->liveins()) { - if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { - report("MBB live-in list contains non-physical register", MBB); - continue; + if (MRI->tracksLiveness()) { + for (const auto &LI : MBB->liveins()) { + if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { + report("MBB live-in list contains non-physical register", MBB); + continue; + } + for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + regsLive.insert(*SubRegs); } - for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - regsLive.insert(*SubRegs); } regsLiveInButUnused = regsLive; diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index de1c35caa1a..fdf741fd58f 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -48,11 +48,6 @@ void RegScavenger::init(MachineBasicBlock &MBB) { assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) && "Target changed?"); - // It is not possible to use the register scavenger after late optimization - // passes that don't preserve accurate liveness information. 
- assert(MRI->tracksLiveness() && - "Cannot use register scavenger with inaccurate liveness"); - // Self-initialize. if (!this->MBB) { NumRegUnits = TRI->getNumRegUnits(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cb803585282..a07bd8f8354 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -7339,19 +7340,23 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG, if (!Range) return Op; - Constant *Lo = cast(Range->getOperand(0))->getValue(); - if (!Lo->isNullValue()) + ConstantRange CR = getConstantRangeFromMetadata(*Range); + if (CR.isFullSet() || CR.isEmptySet() || CR.isWrappedSet()) return Op; - Constant *Hi = cast(Range->getOperand(1))->getValue(); - unsigned Bits = cast(Hi)->getValue().logBase2(); + APInt Lo = CR.getUnsignedMin(); + if (!Lo.isMinValue()) + return Op; + + APInt Hi = CR.getUnsignedMax(); + unsigned Bits = Hi.getActiveBits(); EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits); SDLoc SL = getCurSDLoc(); - SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), - Op, DAG.getValueType(SmallVT)); + SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), Op, + DAG.getValueType(SmallVT)); unsigned NumVals = Op.getNode()->getNumValues(); if (NumVals == 1) return ZExt; diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index deec1633022..2aac3474654 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -299,11 +299,8 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const { Ranges.insert(Ranges.end(), DIERanges.begin(), DIERanges.end()); } - DWARFDie Child = getFirstChild(); - while (Child) { + for (auto Child: children()) Child.collectChildrenAddressRanges(Ranges); - Child = Child.getSibling(); - } } bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const { diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index e6c9764f113..2bbcb25275e 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -468,6 +468,7 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { Options.HandleInt = Flags.handle_int; Options.HandleSegv = Flags.handle_segv; Options.HandleTerm = Flags.handle_term; + Options.HandleXfsz = Flags.handle_xfsz; SetSignalHandler(Options); if (Flags.minimize_crash_internal_step) diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 08eaad9856b..22aad353ace 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -91,6 +91,7 @@ FUZZER_FLAG_INT(handle_ill, 1, "If 1, try to intercept SIGILL.") FUZZER_FLAG_INT(handle_fpe, 1, "If 1, try to intercept SIGFPE.") FUZZER_FLAG_INT(handle_int, 1, "If 1, try to intercept SIGINT.") FUZZER_FLAG_INT(handle_term, 1, "If 1, try to intercept SIGTERM.") +FUZZER_FLAG_INT(handle_xfsz, 1, "If 1, try to intercept SIGXFSZ.") FUZZER_FLAG_INT(close_fd_mask, 0, "If 1, close stdout at startup; " "if 2, close stderr; if 3, close both. " "Be careful, this will also close e.g. 
asan's stderr/stdout.") diff --git a/lib/Fuzzer/FuzzerIO.h b/lib/Fuzzer/FuzzerIO.h index 741fecf415b..15bfd3d3472 100644 --- a/lib/Fuzzer/FuzzerIO.h +++ b/lib/Fuzzer/FuzzerIO.h @@ -37,6 +37,9 @@ std::string DirPlusFile(const std::string &DirPath, // Returns the name of the dir, similar to the 'dirname' utility. std::string DirName(const std::string &FileName); +// Returns path to a TmpDir. +std::string TmpDir(); + void DupAndCloseStderr(); void CloseStdout(); diff --git a/lib/Fuzzer/FuzzerIOPosix.cpp b/lib/Fuzzer/FuzzerIOPosix.cpp index 720bc130459..6d8edf6ff53 100644 --- a/lib/Fuzzer/FuzzerIOPosix.cpp +++ b/lib/Fuzzer/FuzzerIOPosix.cpp @@ -83,6 +83,12 @@ std::string DirName(const std::string &FileName) { return Res; } +std::string TmpDir() { + if (auto Env = getenv("TMPDIR")) + return Env; + return "/tmp"; +} + } // namespace fuzzer #endif // LIBFUZZER_POSIX diff --git a/lib/Fuzzer/FuzzerIOWindows.cpp b/lib/Fuzzer/FuzzerIOWindows.cpp index a4738eb9dfe..056f0721a33 100644 --- a/lib/Fuzzer/FuzzerIOWindows.cpp +++ b/lib/Fuzzer/FuzzerIOWindows.cpp @@ -277,6 +277,8 @@ std::string DirName(const std::string &FileName) { return FileName.substr(0, LocationLen + DirLen); } +std::string TmpDir() { return "TODO: implement TmpDir"; } + } // namespace fuzzer #endif // LIBFUZZER_WINDOWS diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index c041706092d..0d2c7a78aca 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -82,6 +82,7 @@ class Fuzzer { static void StaticAlarmCallback(); static void StaticCrashSignalCallback(); static void StaticInterruptCallback(); + static void StaticFileSizeExceedCallback(); void ExecuteCallback(const uint8_t *Data, size_t Size); size_t RunOne(const uint8_t *Data, size_t Size); diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 1336f5e4aee..9f49d155799 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -266,6 +266,11 @@ void Fuzzer::StaticInterruptCallback() { F->InterruptCallback(); } +void Fuzzer::StaticFileSizeExceedCallback() { + Printf("==%lu== ERROR: libFuzzer: file size exceeded\n", GetPid()); + exit(1); +} + void Fuzzer::CrashCallback() { Printf("==%lu== ERROR: libFuzzer: deadly signal\n", GetPid()); if (EF->__sanitizer_print_stack_trace) diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp index 84660e0fe53..9e559115680 100644 --- a/lib/Fuzzer/FuzzerMerge.cpp +++ b/lib/Fuzzer/FuzzerMerge.cpp @@ -220,8 +220,8 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, ListFilesInDirRecursive(Corpora[i], nullptr, &AllFiles, /*TopDir*/true); Printf("MERGE-OUTER: %zd files, %zd in the initial corpus\n", AllFiles.size(), NumFilesInFirstCorpus); - std::string CFPath = - "libFuzzerTemp." + std::to_string(GetPid()) + ".txt"; + auto CFPath = DirPlusFile(TmpDir(), + "libFuzzerTemp." + std::to_string(GetPid()) + ".txt"); // Write the control file. RemoveFile(CFPath); std::ofstream ControlFile(CFPath); @@ -229,6 +229,11 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, ControlFile << NumFilesInFirstCorpus << "\n"; for (auto &Path: AllFiles) ControlFile << Path << "\n"; + if (!ControlFile) { + Printf("MERGE-OUTER: failed to write to the control file: %s\n", + CFPath.c_str()); + exit(1); + } ControlFile.close(); // Execute the inner process untill it passes. @@ -246,6 +251,9 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, // Read the control file and do the merge. 
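// For reference when reading the ParseOrExit call below: the control file
// written above is line oriented, starting with the total number of inputs,
// then the number that came from the initial corpus, then one path per line.
// A hypothetical file for three inputs, two of them from the initial corpus,
// would look like this (paths invented for illustration):
//
//   3
//   2
//   corpus1/a
//   corpus1/b
//   corpus2/c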
Merger M; std::ifstream IF(CFPath); + IF.seekg(0, IF.end); + Printf("MERGE-OUTER: the control file has %zd bytes\n", (size_t)IF.tellg()); + IF.seekg(0, IF.beg); M.ParseOrExit(IF, true); IF.close(); std::vector NewFiles; diff --git a/lib/Fuzzer/FuzzerOptions.h b/lib/Fuzzer/FuzzerOptions.h index cb702d28520..6f72205600b 100644 --- a/lib/Fuzzer/FuzzerOptions.h +++ b/lib/Fuzzer/FuzzerOptions.h @@ -62,6 +62,7 @@ struct FuzzingOptions { bool HandleInt = false; bool HandleSegv = false; bool HandleTerm = false; + bool HandleXfsz = false; }; } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp index be62a6624b2..2ad9702fab0 100644 --- a/lib/Fuzzer/FuzzerTraceState.cpp +++ b/lib/Fuzzer/FuzzerTraceState.cpp @@ -46,10 +46,6 @@ class TraceState { void TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1, const uint8_t *Data2); - void TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits, uint64_t Val, - size_t NumCases, uint64_t *Cases); - int TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData, - size_t DataSize); int TryToAddDesiredData(const uint8_t *PresentData, const uint8_t *DesiredData, size_t DataSize); @@ -147,29 +143,6 @@ class TraceState { size_t AutoDictAdds = 0; }; -int TraceState::TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData, - size_t DataSize) { - if (NumMutations >= kMaxMutations || !WantToHandleOneMoreMutation()) return 0; - ScopedDoingMyOwnMemmem scoped_doing_my_own_memmem; - const uint8_t *UnitData; - auto UnitSize = F->GetCurrentUnitInFuzzingThead(&UnitData); - int Res = 0; - const uint8_t *Beg = UnitData; - const uint8_t *End = Beg + UnitSize; - for (const uint8_t *Cur = Beg; Cur < End; Cur++) { - Cur = (uint8_t *)SearchMemory(Cur, End - Cur, &PresentData, DataSize); - if (!Cur) - break; - size_t Pos = Cur - Beg; - assert(Pos < UnitSize); - AddMutation(Pos, DataSize, DesiredData); - AddMutation(Pos, DataSize, DesiredData + 1); - AddMutation(Pos, DataSize, DesiredData - 1); - Res++; - } - return Res; -} - int TraceState::TryToAddDesiredData(const uint8_t *PresentData, const uint8_t *DesiredData, size_t DataSize) { @@ -206,26 +179,6 @@ void TraceState::TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1, } } -void TraceState::TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits, - uint64_t Val, size_t NumCases, - uint64_t *Cases) { - if (F->InFuzzingThread()) return; - size_t ValSize = ValSizeInBits / 8; - bool TryShort = IsTwoByteData(Val); - for (size_t i = 0; i < NumCases; i++) - TryShort &= IsTwoByteData(Cases[i]); - - if (Options.Verbosity >= 3) - Printf("TraceSwitch: %p %zd # %zd; TryShort %d\n", PC, Val, NumCases, - TryShort); - - for (size_t i = 0; i < NumCases; i++) { - TryToAddDesiredData(Val, Cases[i], ValSize); - if (TryShort) - TryToAddDesiredData(Val, Cases[i], 2); - } -} - static TraceState *TS; void Fuzzer::StartTraceRecording() { diff --git a/lib/Fuzzer/FuzzerUtilPosix.cpp b/lib/Fuzzer/FuzzerUtilPosix.cpp index 8b484b8effa..e8d48dc81a3 100644 --- a/lib/Fuzzer/FuzzerUtilPosix.cpp +++ b/lib/Fuzzer/FuzzerUtilPosix.cpp @@ -41,6 +41,10 @@ static void InterruptHandler(int, siginfo_t *, void *) { Fuzzer::StaticInterruptCallback(); } +static void FileSizeExceedHandler(int, siginfo_t *, void *) { + Fuzzer::StaticFileSizeExceedCallback(); +} + static void SetSigaction(int signum, void (*callback)(int, siginfo_t *, void *)) { struct sigaction sigact; @@ -80,6 +84,8 @@ void SetSignalHandler(const FuzzingOptions& Options) { SetSigaction(SIGILL, CrashHandler); if (Options.HandleFpe) 
SetSigaction(SIGFPE, CrashHandler); + if (Options.HandleXfsz) + SetSigaction(SIGXFSZ, FileSizeExceedHandler); } void SleepSeconds(int Seconds) { diff --git a/lib/Fuzzer/FuzzerUtilWindows.cpp b/lib/Fuzzer/FuzzerUtilWindows.cpp index 64adb7cd138..3ca1f2c8f56 100644 --- a/lib/Fuzzer/FuzzerUtilWindows.cpp +++ b/lib/Fuzzer/FuzzerUtilWindows.cpp @@ -58,6 +58,7 @@ LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) { if (HandlerOpt->HandleFpe) Fuzzer::StaticCrashSignalCallback(); break; + // TODO: handle (Options.HandleXfsz) } return EXCEPTION_CONTINUE_SEARCH; } diff --git a/lib/Fuzzer/test/merge.test b/lib/Fuzzer/test/merge.test index 1f1810eb019..5c7d30e41ca 100644 --- a/lib/Fuzzer/test/merge.test +++ b/lib/Fuzzer/test/merge.test @@ -44,3 +44,11 @@ MERGE_WITH_CRASH: MERGE-OUTER: 3 new files # Check that we actually limit the size with max_len RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 -max_len=5 2>&1 | FileCheck %s --check-prefix=MERGE_LEN5 MERGE_LEN5: MERGE-OUTER: succesfull in 1 attempt(s) + +# Check that we honor TMPDIR +RUN: TMPDIR=DIR_DOES_NOT_EXIST not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=TMPDIR +TMPDIR: MERGE-OUTER: failed to write to the control file: DIR_DOES_NOT_EXIST/libFuzzerTemp + +# Check that we can report an error if file size exceeded +RUN: (ulimit -f 1; not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ) +SIGXFSZ: ERROR: libFuzzer: file size exceeded diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 42b3a344352..e3e2f9f806c 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -337,12 +337,21 @@ void LTO::addSymbolToGlobalRes(SmallPtrSet &Used, if (Res.Prevailing) GlobalRes.IRName = GV->getName(); } + // Set the partition to external if we know it is used elsewhere, e.g. + // it is visible to a regular object, is referenced from llvm.compiler_used, + // or was already recorded as being referenced from a different partition. if (Res.VisibleToRegularObj || (GV && Used.count(GV)) || (GlobalRes.Partition != GlobalResolution::Unknown && - GlobalRes.Partition != Partition)) + GlobalRes.Partition != Partition)) { GlobalRes.Partition = GlobalResolution::External; - else + } else + // First recorded reference, save the current partition. GlobalRes.Partition = Partition; + + // Flag as visible outside of ThinLTO if visible from a regular object or + // if this is a reference in the regular LTO partition. + GlobalRes.VisibleOutsideThinLTO |= + (Res.VisibleToRegularObj || (Partition == GlobalResolution::RegularLTO)); } static void writeToResolutionFile(raw_ostream &OS, InputFile *Input, @@ -848,6 +857,19 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, if (!ModuleToDefinedGVSummaries.count(Mod.first)) ModuleToDefinedGVSummaries.try_emplace(Mod.first); + // Compute "dead" symbols, we don't want to import/export these! + DenseSet GUIDPreservedSymbols; + for (auto &Res : GlobalResolutions) { + if (Res.second.VisibleOutsideThinLTO && + // IRName will be defined if we have seen the prevailing copy of + // this value. If not, no need to preserve any ThinLTO copies. 
+ !Res.second.IRName.empty()) + GUIDPreservedSymbols.insert(GlobalValue::getGUID(Res.second.IRName)); + } + + auto DeadSymbols = + computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols); + StringMap ImportLists( ThinLTO.ModuleMap.size()); StringMap ExportLists( @@ -856,12 +878,21 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, if (Conf.OptLevel > 0) { ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - ImportLists, ExportLists); + ImportLists, ExportLists, &DeadSymbols); std::set ExportedGUIDs; for (auto &Res : GlobalResolutions) { - if (!Res.second.IRName.empty() && - Res.second.Partition == GlobalResolution::External) + // First check if the symbol was flagged as having external references. + if (Res.second.Partition != GlobalResolution::External) + continue; + // IRName will be defined if we have seen the prevailing copy of + // this value. If not, no need to mark as exported from a ThinLTO + // partition (and we can't get the GUID). + if (Res.second.IRName.empty()) + continue; + auto GUID = GlobalValue::getGUID(Res.second.IRName); + // Mark exported unless index-based analysis determined it to be dead. + if (!DeadSymbols.count(GUID)) ExportedGUIDs.insert(GlobalValue::getGUID(Res.second.IRName)); } diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 880dc3dfae9..66ffe6db29d 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -581,11 +581,18 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, StringMap ModuleToDefinedGVSummaries; Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID + auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( + PreservedSymbols, Triple(TheModule.getTargetTriple())); + + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); // Resolve LinkOnce/Weak symbols. StringMap> ResolvedODR; @@ -594,10 +601,6 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, thinLTOResolveWeakForLinkerModule( TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]); - // Convert the preserved symbols set from string to GUID - auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( - PreservedSymbols, Triple(TheModule.getTargetTriple())); - // Promote the exported values in the index, so that they are promoted // in the module. auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) { @@ -623,11 +626,18 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule, StringMap ModuleToDefinedGVSummaries(ModuleCount); Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID + auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( + PreservedSymbols, Triple(TheModule.getTargetTriple())); + + // Compute "dead" symbols, we don't want to import/export these! 
+ auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); auto &ImportList = ImportLists[TheModule.getModuleIdentifier()]; crossImportIntoModule(TheModule, Index, ModuleMap, ImportList); @@ -697,11 +707,14 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule, StringMap ModuleToDefinedGVSummaries(ModuleCount); Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); auto &ExportList = ExportLists[ModuleIdentifier]; // Be friendly and don't nuke totally the module when the client didn't @@ -836,17 +849,20 @@ void ThinLTOCodeGenerator::run() { StringMap ModuleToDefinedGVSummaries(ModuleCount); Index->collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID, this is needed for + // computing the caching hash and the internalization. + auto GUIDPreservedSymbols = + computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); + + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(*Index, GUIDPreservedSymbols); + // Collect the import/export lists for all modules from the call-graph in the // combined index. StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(*Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); - - // Convert the preserved symbols set from string to GUID, this is needed for - // computing the caching hash and the internalization. - auto GUIDPreservedSymbols = - computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); + ExportLists, &DeadSymbols); // We use a std::map here to be able to have a defined ordering when // producing a hash for the cache entry. diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 0c0b498f137..fb8b45166a4 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -205,7 +205,7 @@ APInt& APInt::operator++() { /// This function subtracts a single "digit" (64-bit word), y, from /// the multi-digit integer array, x[], propagating the borrowed 1 value until -/// no further borrowing is neeeded or it runs out of "digits" in x. The result +/// no further borrowing is needed or it runs out of "digits" in x. The result /// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. /// In other words, if y > x then this function returns 1, otherwise 0. 
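// A free-standing illustration of the borrow propagation described above,
// operating on a small array of 64-bit "digits". This sketches the same idea in
// isolation; it is not the library's implementation, and the name and signature
// are invented:
static int subOneWordWithBorrow(uint64_t X[], unsigned NumWords, uint64_t Y) {
  for (unsigned I = 0; I != NumWords; ++I) {
    uint64_t Prev = X[I];
    X[I] -= Y;          // subtract the digit (or the borrowed 1)
    if (Prev >= Y)
      return 0;         // no underflow here, so borrowing stops
    Y = 1;              // underflow: borrow 1 from the next digit up
  }
  return 1;             // digits exhausted, i.e. y was larger than x
}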
/// @returns the borrow out of the subtraction diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index ca344b1dc05..15418ad2fd0 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -90,6 +90,7 @@ add_llvm_library(LLVMSupport StringSaver.cpp StringRef.cpp SystemUtils.cpp + TarWriter.cpp TargetParser.cpp ThreadPool.cpp Timer.cpp diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 49d0ed55a71..8a09589aa88 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -474,15 +474,25 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model, break; // Skylake: - case 0x4e: - *Type = INTEL_COREI7; // "skylake-avx512" - *Subtype = INTEL_COREI7_SKYLAKE_AVX512; - break; - case 0x5e: + case 0x4e: // Skylake mobile + case 0x5e: // Skylake desktop + case 0x8e: // Kaby Lake mobile + case 0x9e: // Kaby Lake desktop *Type = INTEL_COREI7; // "skylake" *Subtype = INTEL_COREI7_SKYLAKE; break; + // Skylake Xeon: + case 0x55: + *Type = INTEL_COREI7; + // Check that we really have AVX512 + if (Features & (1 << FEATURE_AVX512)) { + *Subtype = INTEL_COREI7_SKYLAKE_AVX512; // "skylake-avx512" + } else { + *Subtype = INTEL_COREI7_SKYLAKE; // "skylake" + } + break; + case 0x1c: // Most 45 nm Intel Atom processors case 0x26: // 45 nm Atom Lincroft case 0x27: // 32 nm Atom Medfield diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp new file mode 100644 index 00000000000..5fc17d27637 --- /dev/null +++ b/lib/Support/TarWriter.cpp @@ -0,0 +1,166 @@ +//===-- TarWriter.cpp - Tar archive file creator --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TarWriter class provides a feature to create a tar archive file. +// +// I put emphasis on simplicity over comprehensiveness when implementing this +// class because we don't need a full-fledged archive file generator in LLVM +// at the moment. +// +// The filename field in the Unix V7 tar header is 100 bytes. Longer filenames +// are stored using the PAX extension. The PAX header is standardized in +// POSIX.1-2001. +// +// The struct definition of UstarHeader is copied from +// https://www.freebsd.org/cgi/man.cgi?query=tar&sektion=5 +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/TarWriter.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +// Each file in an archive must be aligned to this block size. +static const int BlockSize = 512; + +struct UstarHeader { + char Name[100]; + char Mode[8]; + char Uid[8]; + char Gid[8]; + char Size[12]; + char Mtime[12]; + char Checksum[8]; + char TypeFlag; + char Linkname[100]; + char Magic[6]; + char Version[2]; + char Uname[32]; + char Gname[32]; + char DevMajor[8]; + char DevMinor[8]; + char Prefix[155]; + char Pad[12]; +}; +static_assert(sizeof(UstarHeader) == BlockSize, "invalid Ustar header"); + +// A PAX attribute is in the form of " =\n" +// where is the length of the entire string including +// the length field itself. An example string is this. +// +// 25 ctime=1084839148.1212\n +// +// This function create such string. 
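// Worked example of the self-referential length field that formatPax (below)
// computes, with values chosen purely for illustration. For Key = "path" and
// Val = "/foo":
//
//   Len   = 4 + 4 + 3           = 11   // key + value + " ", "=", "\n"
//   Total = Len + strlen("11")  = 13   // first guess, using Len's own width
//   Total = Len + strlen("13")  = 13   // recomputed; unchanged, so it is final
//
// The emitted record is "13 path=/foo\n", which is indeed 13 bytes long. The
// second pass matters when appending the length field pushes the total into one
// more decimal digit, which is the off-by-one case the comment above mentions.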
+static std::string formatPax(StringRef Key, StringRef Val) { + int Len = Key.size() + Val.size() + 3; // +3 for " ", "=" and "\n" + + // We need to compute total size twice because appending + // a length field could change total size by one. + int Total = Len + Twine(Len).str().size(); + Total = Len + Twine(Total).str().size(); + return (Twine(Total) + " " + Key + "=" + Val + "\n").str(); +} + +// Headers in tar files must be aligned to 512 byte boundaries. +// This function forwards the current file position to the next boundary. +static void pad(raw_fd_ostream &OS) { + uint64_t Pos = OS.tell(); + OS.seek(alignTo(Pos, BlockSize)); +} + +// Computes a checksum for a tar header. +static void computeChecksum(UstarHeader &Hdr) { + // Before computing a checksum, checksum field must be + // filled with space characters. + memset(Hdr.Checksum, ' ', sizeof(Hdr.Checksum)); + + // Compute a checksum and set it to the checksum field. + unsigned Chksum = 0; + for (size_t I = 0; I < sizeof(Hdr); ++I) + Chksum += reinterpret_cast(&Hdr)[I]; + snprintf(Hdr.Checksum, sizeof(Hdr.Checksum), "%06o", Chksum); +} + +// Create a tar header and write it to a given output stream. +static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { + // A PAX header consists of a 512-byte header followed + // by key-value strings. First, create key-value strings. + std::string PaxAttr = formatPax("path", Path); + + // Create a 512-byte header. + UstarHeader Hdr = {}; + snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", PaxAttr.size()); + Hdr.TypeFlag = 'x'; // PAX magic + memcpy(Hdr.Magic, "ustar", 6); // Ustar magic + computeChecksum(Hdr); + + // Write them down. + OS << StringRef(reinterpret_cast(&Hdr), sizeof(Hdr)); + OS << PaxAttr; + pad(OS); +} + +// The PAX header is an extended format, so a PAX header needs +// to be followed by a "real" header. +static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) { + UstarHeader Hdr = {}; + memcpy(Hdr.Name, Path.data(), Path.size()); + memcpy(Hdr.Mode, "0000664", 8); + snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size); + memcpy(Hdr.Magic, "ustar", 6); + computeChecksum(Hdr); + OS << StringRef(reinterpret_cast(&Hdr), sizeof(Hdr)); +} + +// We want to use '/' as a path separator even on Windows. +// This function canonicalizes a given path. +static std::string canonicalize(std::string S) { +#ifdef LLVM_ON_WIN32 + std::replace(S.begin(), S.end(), '\\', '/'); +#endif + return S; +} + +// Creates a TarWriter instance and returns it. +Expected> TarWriter::create(StringRef OutputPath, + StringRef BaseDir) { + int FD; + if (std::error_code EC = openFileForWrite(OutputPath, FD, sys::fs::F_None)) + return make_error("cannot open " + OutputPath, EC); + return std::unique_ptr(new TarWriter(FD, BaseDir)); +} + +TarWriter::TarWriter(int FD, StringRef BaseDir) + : OS(FD, /*shouldClose=*/true, /*unbuffered=*/false), BaseDir(BaseDir) {} + +// Append a given file to an archive. +void TarWriter::append(StringRef Path, StringRef Data) { + // Write Path and Data. + std::string S = BaseDir + "/" + canonicalize(Path) + "\0"; + if (S.size() <= sizeof(UstarHeader::Name)) { + writeUstarHeader(OS, S, Data.size()); + } else { + writePaxHeader(OS, S); + writeUstarHeader(OS, "", Data.size()); + } + + OS << Data; + pad(OS); + + // POSIX requires tar archives end with two null blocks. + // Here, we write the terminator and then seek back, so that + // the file being output is terminated correctly at any moment. 
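// For a sense of how the class defined in this file is meant to be driven end
// to end, a hypothetical caller might look like the fragment below. The file
// and directory names are invented, and Expected<std::unique_ptr<TarWriter>> is
// inferred as the return type of create() (template arguments are not visible
// in this copy of the diff).
//
//   Expected<std::unique_ptr<TarWriter>> Tar =
//       TarWriter::create("repro.tar", "repro-dir");
//   if (!Tar)
//     return Tar.takeError();              // could not open the output file
//   (*Tar)->append("response.txt", Data1); // short path: plain ustar header
//   (*Tar)->append(VeryLongPath, Data2);   // too long for the 100-byte name
//                                          // field: PAX header, then ustar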
+  uint64_t Pos = OS.tell();
+  OS << std::string(BlockSize * 2, '\0');
+  OS.seek(Pos);
+  OS.flush();
+}
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 3750d7f4c09..9752b70644c 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -48,7 +48,7 @@
 // _Unwind_Backtrace function, but on FreeBSD the configure test passes
 // despite the function not existing, and on Android, conflicts
 // with <unwind.h>.
-#if defined(__GLIBC__) || defined(__APPLE__)
+#ifdef __GLIBC__
 #include <execinfo.h>
 #else
 #undef HAVE__UNWIND_BACKTRACE
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 7666011f75b..17aafa0c3d6 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -110,72 +110,34 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-collect-loh"
 
-static cl::opt<bool>
-PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden,
-                   cl::desc("Restrict analysis to registers invovled"
-                            " in LOHs"),
-                   cl::init(true));
-
-static cl::opt<bool>
-BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden,
-                    cl::desc("Restrict analysis at basic block scope"),
-                    cl::init(true));
-
 STATISTIC(NumADRPSimpleCandidate,
           "Number of simplifiable ADRP dominate by another");
-#ifndef NDEBUG
-STATISTIC(NumADRPComplexCandidate2,
-          "Number of simplifiable ADRP reachable by 2 defs");
-STATISTIC(NumADRPComplexCandidate3,
-          "Number of simplifiable ADRP reachable by 3 defs");
-STATISTIC(NumADRPComplexCandidateOther,
-          "Number of simplifiable ADRP reachable by 4 or more defs");
-STATISTIC(NumADDToSTRWithImm,
-          "Number of simplifiable STR with imm reachable by ADD");
-STATISTIC(NumLDRToSTRWithImm,
-          "Number of simplifiable STR with imm reachable by LDR");
 STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD");
 STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR");
-STATISTIC(NumADDToLDRWithImm,
-          "Number of simplifiable LDR with imm reachable by ADD");
-STATISTIC(NumLDRToLDRWithImm,
-          "Number of simplifiable LDR with imm reachable by LDR");
 STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD");
 STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR");
-#endif // NDEBUG
 STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP");
-#ifndef NDEBUG
-STATISTIC(NumCplxLvl1, "Number of complex case of level 1");
-STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1");
-STATISTIC(NumCplxLvl2, "Number of complex case of level 2");
-STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2");
-#endif // NDEBUG
 STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
-STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
 
 #define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)"
 
 namespace {
+
 struct AArch64CollectLOH : public MachineFunctionPass {
   static char ID;
-  AArch64CollectLOH() : MachineFunctionPass(ID) {
-
initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry()); - } + AArch64CollectLOH() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -187,351 +149,57 @@ struct AArch64CollectLOH : public MachineFunctionPass { StringRef getPassName() const override { return AARCH64_COLLECT_LOH_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); + AU.setPreservesAll(); } - -private: }; -/// A set of MachineInstruction. -typedef SetVector SetOfMachineInstr; -/// Map a basic block to a set of instructions per register. -/// This is used to represent the exposed uses of a basic block -/// per register. -typedef MapVector> -BlockToSetOfInstrsPerColor; -/// Map a basic block to an instruction per register. -/// This is used to represent the live-out definitions of a basic block -/// per register. -typedef MapVector> -BlockToInstrPerColor; -/// Map an instruction to a set of instructions. Used to represent the -/// mapping def to reachable uses or use to definitions. -typedef MapVector InstrToInstrs; -/// Map a basic block to a BitVector. -/// This is used to record the kill registers per basic block. -typedef MapVector BlockToRegSet; - -/// Map a register to a dense id. -typedef DenseMap MapRegToId; -/// Map a dense id to a register. Used for debug purposes. -typedef SmallVector MapIdToReg; -} // end anonymous namespace. - char AArch64CollectLOH::ID = 0; -INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", - AARCH64_COLLECT_LOH_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", - AARCH64_COLLECT_LOH_NAME, false, false) - -/// Given a couple (MBB, reg) get the corresponding set of instruction from -/// the given "sets". -/// If this couple does not reference any set, an empty set is added to "sets" -/// for this couple and returned. -/// \param nbRegs is used internally allocate some memory. It must be consistent -/// with the way sets is used. -static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, - const MachineBasicBlock &MBB, unsigned reg, - unsigned nbRegs) { - SetOfMachineInstr *result; - BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); - if (it != sets.end()) - result = it->second.get(); - else - result = (sets[&MBB] = make_unique(nbRegs)).get(); - - return result[reg]; -} - -/// Given a couple (reg, MI) get the corresponding set of instructions from the -/// the given "sets". -/// This is used to get the uses record in sets of a definition identified by -/// MI and reg, i.e., MI defines reg. -/// If the couple does not reference anything, an empty set is added to -/// "sets[reg]". -/// \pre set[reg] is valid. -static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - return sets[reg][&MI]; -} - -/// Same as getUses but does not modify the input map: sets. -/// \return NULL if the couple (reg, MI) is not in sets. -static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - InstrToInstrs::const_iterator Res = sets[reg].find(&MI); - if (Res != sets[reg].end()) - return &(Res->second); - return nullptr; -} - -/// Initialize the reaching definition algorithm: -/// For each basic block BB in MF, record: -/// - its kill set. -/// - its reachable uses (uses that are exposed to BB's predecessors). -/// - its the generated definitions. 
-/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to -/// the list of uses of exposed defintions. -/// \param ADRPMode specifies to only consider ADRP instructions for generated -/// definition. It also consider definitions of ADRP instructions as uses and -/// ignore other uses. The ADRPMode is used to collect the information for LHO -/// that involve ADRP operation only. -static void initReachingDef(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - const MapRegToId &RegToId, - const MachineInstr *DummyOp, bool ADRPMode) { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - unsigned NbReg = RegToId.size(); - - for (const MachineBasicBlock &MBB : MF) { - auto &BBGen = Gen[&MBB]; - BBGen = make_unique(NbReg); - std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr); - - BitVector &BBKillSet = Kill[&MBB]; - BBKillSet.resize(NbReg); - for (const MachineInstr &MI : MBB) { - bool IsADRP = MI.getOpcode() == AArch64::ADRP; - - // Process uses first. - if (IsADRP || !ADRPMode) - for (const MachineOperand &MO : MI.operands()) { - // Treat ADRP def as use, as the goal of the analysis is to find - // ADRP defs reached by other ADRP defs. - if (!MO.isReg() || (!ADRPMode && !MO.isUse()) || - (ADRPMode && (!IsADRP || !MO.isDef()))) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - CurReg = ItCurRegId->second; - - // if CurReg has not been defined, this use is reachable. - if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) - getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI); - // current basic block definition for this color, if any, is in Gen. - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI); - } - - // Process clobbers. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - // Clobbers kill the related colors. - const uint32_t *PreservedRegs = MO.getRegMask(); - - // Set generated regs. - for (const auto &Entry : RegToId) { - unsigned Reg = Entry.second; - // Use the global register ID when querying APIs external to this - // pass. - if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { - // Do not register clobbered definition for no ADRP. - // This definition is not used anyway (otherwise register - // allocation is wrong). - BBGen[Reg] = ADRPMode ? &MI : nullptr; - BBKillSet.set(Reg); - } - } - } - - // Process register defs. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { - MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - // If this alias has not been recorded, then it is not interesting - // for the current analysis. - // We can end up in this situation because of tuple registers. - // E.g., Let say we are interested in S1. When we register - // S1, we will also register its aliases and in particular - // the tuple Q1_Q2. - // Now, when we encounter Q1_Q2, we will look through its aliases - // and will find that S2 is not registered. 
- if (ItRegId == RegToId.end()) - continue; - - BBKillSet.set(ItRegId->second); - BBGen[ItRegId->second] = &MI; - } - BBGen[ItCurRegId->second] = &MI; - } - } - - // If we restrict our analysis to basic block scope, conservatively add a - // dummy - // use for each generated value. - if (!ADRPMode && DummyOp && !MBB.succ_empty()) - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp); - } -} - -/// Reaching def core algorithm: -/// while an Out has changed -/// for each bb -/// for each color -/// In[bb][color] = U Out[bb.predecessors][color] -/// insert reachableUses[bb][color] in each in[bb][color] -/// op.reachedUses -/// -/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) -static void reachingDefAlgorithm(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - unsigned NbReg) { - bool HasChanged; - do { - HasChanged = false; - for (const MachineBasicBlock &MBB : MF) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); - SetOfMachineInstr &BBReachableUses = - getSet(ReachableUses, MBB, CurReg, NbReg); - SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); - unsigned Size = BBOutSet.size(); - // In[bb][color] = U Out[bb.predecessors][color] - for (const MachineBasicBlock *PredMBB : MBB.predecessors()) { - SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); - BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); - } - // insert reachableUses[bb][color] in each in[bb][color] op.reachedses - for (const MachineInstr *MI : BBInSet) { - SetOfMachineInstr &OpReachedUses = - getUses(ColorOpToReachedUses, CurReg, *MI); - OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); - } - // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) - if (!Kill[&MBB].test(CurReg)) - BBOutSet.insert(BBInSet.begin(), BBInSet.end()); - if (Gen[&MBB][CurReg]) - BBOutSet.insert(Gen[&MBB][CurReg]); - HasChanged |= BBOutSet.size() != Size; - } - } - } while (HasChanged); -} - -/// Reaching definition algorithm. -/// \param MF function on which the algorithm will operate. -/// \param[out] ColorOpToReachedUses will contain the result of the reaching -/// def algorithm. -/// \param ADRPMode specify whether the reaching def algorithm should be tuned -/// for ADRP optimization. \see initReachingDef for more details. -/// \param DummyOp if not NULL, the algorithm will work at -/// basic block scope and will set for every exposed definition a use to -/// @p DummyOp. -/// \pre ColorOpToReachedUses is an array of at least number of registers of -/// InstrToInstrs. -static void reachingDef(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, bool ADRPMode = false, - const MachineInstr *DummyOp = nullptr) { - // structures: - // For each basic block. - // Out: a set per color of definitions that reach the - // out boundary of this block. - // In: Same as Out but for in boundary. - // Gen: generated color in this block (one operation per color). - // Kill: register set of killed color in this block. - // ReachableUses: a set per color of uses (operation) reachable - // for "In" definitions. 
- BlockToSetOfInstrsPerColor Out, In, ReachableUses; - BlockToInstrPerColor Gen; - BlockToRegSet Kill; - - // Initialize Gen, kill and reachableUses. - initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, - DummyOp, ADRPMode); - - // Algo. - if (!DummyOp) - reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, - ReachableUses, RegToId.size()); -} +} // end anonymous namespace. -#ifndef NDEBUG -/// print the result of the reaching definition algorithm. -static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, - unsigned NbReg, const TargetRegisterInfo *TRI, - const MapIdToReg &IdToReg) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - if (ColorOpToReachedUses[CurReg].empty()) - continue; - DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); +INITIALIZE_PASS(AArch64CollectLOH, "aarch64-collect-loh", + AARCH64_COLLECT_LOH_NAME, false, false) - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - DEBUG(dbgs() << "Def:\n"); - DEBUG(DefsIt.first->print(dbgs())); - DEBUG(dbgs() << "Reachable uses:\n"); - for (const MachineInstr *MI : DefsIt.second) { - DEBUG(MI->print(dbgs())); - } - } +static bool canAddBePartOfLOH(const MachineInstr &MI) { + // Check immediate to see if the immediate is an address. + switch (MI.getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_BlockAddress: + return true; } } -#endif // NDEBUG /// Answer the following question: Can Def be one of the definition /// involved in a part of a LOH? -static bool canDefBePartOfLOH(const MachineInstr *Def) { - unsigned Opc = Def->getOpcode(); +static bool canDefBePartOfLOH(const MachineInstr &MI) { // Accept ADRP, ADDLow and LOADGot. - switch (Opc) { + switch (MI.getOpcode()) { default: return false; case AArch64::ADRP: return true; case AArch64::ADDXri: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_JumpTableIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_BlockAddress: - return true; - } + return canAddBePartOfLOH(MI); case AArch64::LDRXui: // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { + switch (MI.getOperand(2).getType()) { default: return false; case MachineOperand::MO_GlobalAddress: - return true; + return MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT; } } - // Unreachable. - return false; } /// Check whether the given instruction can the end of a LOH chain involving a /// store. -static bool isCandidateStore(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) { + switch (MI.getOpcode()) { default: return false; case AArch64::STRBBui: @@ -543,109 +211,19 @@ static bool isCandidateStore(const MachineInstr *Instr) { case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: + // We can only optimize the index operand. // In case we have str xA, [xA, #imm], this is two different uses // of xA and we cannot fold, otherwise the xA stored may be wrong, // even if #imm == 0. 
- if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg()) - return true; - } - return false; -} - -/// Given the result of a reaching definition algorithm in ColorOpToReachedUses, -/// Build the Use to Defs information and filter out obvious non-LOH candidates. -/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions. -/// In non-ADRPMode, non-LOH candidates are "uses" with several definition, -/// i.e., no simple chain. -/// \param ADRPMode -- \see initReachingDef. -static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, - const InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, - bool ADRPMode = false) { - - SetOfMachineInstr NotCandidate; - unsigned NbReg = RegToId.size(); - MapRegToId::const_iterator EndIt = RegToId.end(); - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) { - // If this color is never defined, continue. - if (ColorOpToReachedUses[CurReg].empty()) - continue; - - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - for (const MachineInstr *MI : DefsIt.second) { - const MachineInstr *Def = DefsIt.first; - MapRegToId::const_iterator It; - // if all the reaching defs are not adrp, this use will not be - // simplifiable. - if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) || - (!ADRPMode && !canDefBePartOfLOH(Def)) || - (!ADRPMode && isCandidateStore(MI) && - // store are LOH candidate iff the end of the chain is used as - // base. - ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt || - It->second != CurReg))) { - NotCandidate.insert(MI); - continue; - } - // Do not consider self reaching as a simplifiable case for ADRP. - if (!ADRPMode || MI != DefsIt.first) { - UseToReachingDefs[MI].insert(DefsIt.first); - // If UsesIt has several reaching definitions, it is not - // candidate for simplificaton in non-ADRPMode. - if (!ADRPMode && UseToReachingDefs[MI].size() > 1) - NotCandidate.insert(MI); - } - } - } - } - for (const MachineInstr *Elem : NotCandidate) { - DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n"); - // It would have been better if we could just remove the entry - // from the map. Because of that, we have to filter the garbage - // (second.empty) in the subsequence analysis. - UseToReachingDefs[Elem].clear(); - } -} - -/// Based on the use to defs information (in ADRPMode), compute the -/// opportunities of LOH ADRP-related. -static void computeADRP(const InstrToInstrs &UseToDefs, - AArch64FunctionInfo &AArch64FI, - const MachineDominatorTree *MDT) { - DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); - for (const auto &Entry : UseToDefs) { - unsigned Size = Entry.second.size(); - if (Size == 0) - continue; - if (Size == 1) { - const MachineInstr *L2 = *Entry.second.begin(); - const MachineInstr *L1 = Entry.first; - if (!MDT->dominates(L2, L1)) { - DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1 - << '\n'); - continue; - } - DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); - AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1}); - ++NumADRPSimpleCandidate; - } -#ifndef NDEBUG - else if (Size == 2) - ++NumADRPComplexCandidate2; - else if (Size == 3) - ++NumADRPComplexCandidate3; - else - ++NumADRPComplexCandidateOther; -#endif - // if Size < 1, the use should have been removed from the candidates - assert(Size >= 1 && "No reaching defs for that use!"); + return MI.getOperandNo(&MO) == 1 && + MI.getOperand(0).getReg() != MI.getOperand(1).getReg(); } } /// Check whether the given instruction can be the end of a LOH chain /// involving a load. 
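Before the load-candidate check that follows, a reading aid for the chain shapes the rewritten pass records. This is a standalone sketch with a local enum (the real values are the MCLOHType kinds used with addLOHDirective in the code further down, not this enum); it only summarizes how many instructions each directive ties together, which matches the two- and three-element argument lists passed to addLOHDirective below.

// Standalone sketch: LOH chain shapes and the number of instructions
// each directive covers (local illustration types only).
enum LOHKindSketch {
  AdrpAdrp,      // ADRP followed by a later ADRP of the same register
  AdrpAdd,       // ADRP + ADDXri materializing a symbol address
  AdrpLdrGot,    // ADRP + LDRXui loading a GOT entry
  AdrpLdr,       // ADRP + load from the symbol
  AdrpAddLdr,    // ADRP + ADDXri + load   (three-instruction chain)
  AdrpAddStr,    // ADRP + ADDXri + store  (three-instruction chain)
  AdrpLdrGotLdr, // ADRP + GOT load + load (three-instruction chain)
  AdrpLdrGotStr  // ADRP + GOT load + store (three-instruction chain)
};

// How many instructions a directive of each kind names.
static unsigned numInstructions(LOHKindSketch K) {
  switch (K) {
  case AdrpAdrp:
  case AdrpAdd:
  case AdrpLdrGot:
  case AdrpLdr:
    return 2;
  case AdrpAddLdr:
  case AdrpAddStr:
  case AdrpLdrGotLdr:
  case AdrpLdrGotStr:
    return 3;
  }
  return 0; // unreachable: all kinds handled above
}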
-static bool isCandidateLoad(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool isCandidateLoad(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDRSBWui: @@ -660,17 +238,13 @@ static bool isCandidateLoad(const MachineInstr *Instr) { case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: - if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT) - return false; - return true; + return !(MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT); } - // Unreachable. - return false; } /// Check whether the given instruction can load a litteral. -static bool supportLoadFromLiteral(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool supportLoadFromLiteral(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDRSWui: @@ -681,353 +255,233 @@ static bool supportLoadFromLiteral(const MachineInstr *Instr) { case AArch64::LDRQui: return true; } - // Unreachable. - return false; } -/// Check whether the given instruction is a LOH candidate. -/// \param UseToDefs is used to check that Instr is at the end of LOH supported -/// chain. -/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are -/// already been filtered out. -static bool isCandidate(const MachineInstr *Instr, - const InstrToInstrs &UseToDefs, - const MachineDominatorTree *MDT) { - if (!isCandidateLoad(Instr) && !isCandidateStore(Instr)) - return false; +/// Number of GPR registers traked by mapRegToGPRIndex() +static const unsigned N_GPR_REGS = 31; +/// Map register number to index from 0-30. +static int mapRegToGPRIndex(MCPhysReg Reg) { + static_assert(AArch64::X28 - AArch64::X0 + 3 == N_GPR_REGS, "Number of GPRs"); + static_assert(AArch64::W30 - AArch64::W0 + 1 == N_GPR_REGS, "Number of GPRs"); + if (AArch64::X0 <= Reg && Reg <= AArch64::X28) + return Reg - AArch64::X0; + if (AArch64::W0 <= Reg && Reg <= AArch64::W30) + return Reg - AArch64::W0; + // TableGen gives "FP" and "LR" an index not adjacent to X28 so we have to + // handle them as special cases. + if (Reg == AArch64::FP) + return 29; + if (Reg == AArch64::LR) + return 30; + return -1; +} - const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); - if (Def->getOpcode() != AArch64::ADRP) { - // At this point, Def is ADDXri or LDRXui of the right type of - // symbol, because we filtered out the uses that were not defined - // by these kind of instructions (+ ADRP). +/// State tracked per register. +/// The main algorithm walks backwards over a basic block maintaining this +/// datastructure for each tracked general purpose register. +struct LOHInfo { + MCLOHType Type : 8; ///< "Best" type of LOH possible. + bool IsCandidate : 1; ///< Possible LOH candidate. + bool OneUser : 1; ///< Found exactly one user (yet). + bool MultiUsers : 1; ///< Found multiple users. + const MachineInstr *MI0; ///< First instruction involved in the LOH. + const MachineInstr *MI1; ///< Second instruction involved in the LOH + /// (if any). + const MachineInstr *LastADRP; ///< Last ADRP in same register. +}; - // Check if this forms a simple chain: each intermediate node must - // dominates the next one. - if (!MDT->dominates(Def, Instr)) - return false; - // Move one node up in the simple chain. - if (UseToDefs.find(Def) == - UseToDefs.end() - // The map may contain garbage we have to ignore. 
- || - UseToDefs.find(Def)->second.empty()) - return false; - Instr = Def; - Def = *UseToDefs.find(Def)->second.begin(); +/// Update state \p Info given \p MI uses the tracked register. +static void handleUse(const MachineInstr &MI, const MachineOperand &MO, + LOHInfo &Info) { + // We have multiple uses if we already found one before. + if (Info.MultiUsers || Info.OneUser) { + Info.IsCandidate = false; + Info.MultiUsers = true; + return; } - // Check if we reached the top of the simple chain: - // - top is ADRP. - // - check the simple chain property: each intermediate node must - // dominates the next one. - if (Def->getOpcode() == AArch64::ADRP) - return MDT->dominates(Def, Instr); - return false; -} - -static bool registerADRCandidate(const MachineInstr &Use, - const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - AArch64FunctionInfo &AArch64FI, - SetOfMachineInstr *InvolvedInLOHs, - const MapRegToId &RegToId) { - // Look for opportunities to turn ADRP -> ADD or - // ADRP -> LDR GOTPAGEOFF into ADR. - // If ADRP has more than one use. Give up. - if (Use.getOpcode() != AArch64::ADDXri && - (Use.getOpcode() != AArch64::LDRXui || - !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT))) - return false; - InstrToInstrs::const_iterator It = UseToDefs.find(&Use); - // The map may contain garbage that we need to ignore. - if (It == UseToDefs.end() || It->second.empty()) - return false; - const MachineInstr &Def = **It->second.begin(); - if (Def.getOpcode() != AArch64::ADRP) - return false; - // Check the number of users of ADRP. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def.getOperand(0).getReg())->second, Def); - if (Users->size() > 1) { - ++NumADRComplexCandidate; - return false; + Info.OneUser = true; + + // Start new LOHInfo if applicable. + if (isCandidateLoad(MI)) { + Info.Type = MCLOH_AdrpLdr; + Info.IsCandidate = true; + Info.MI0 = &MI; + // Note that even this is AdrpLdr now, we can switch to a Ldr variant + // later. + } else if (isCandidateStore(MI, MO)) { + Info.Type = MCLOH_AdrpAddStr; + Info.IsCandidate = true; + Info.MI0 = &MI; + Info.MI1 = nullptr; + } else if (MI.getOpcode() == AArch64::ADDXri) { + Info.Type = MCLOH_AdrpAdd; + Info.IsCandidate = true; + Info.MI0 = &MI; + } else if (MI.getOpcode() == AArch64::LDRXui && + MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) { + Info.Type = MCLOH_AdrpLdrGot; + Info.IsCandidate = true; + Info.MI0 = &MI; } - ++NumADRSimpleCandidate; - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) && - "ADRP already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) && - "ADD already involved in LOH."); - DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); - - AArch64FI.addLOHDirective( - Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot, - {&Def, &Use}); - return true; } -/// Based on the use to defs information (in non-ADRPMode), compute the -/// opportunities of LOH non-ADRP-related -static void computeOthers(const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId, - const MachineDominatorTree *MDT) { - SetOfMachineInstr *InvolvedInLOHs = nullptr; -#ifndef NDEBUG - SetOfMachineInstr InvolvedInLOHsStorage; - InvolvedInLOHs = &InvolvedInLOHsStorage; -#endif // NDEBUG - DEBUG(dbgs() << "*** Compute LOH for Others\n"); - // ADRP -> ADD/LDR -> LDR/STR pattern. 
- // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. +/// Update state \p Info given the tracked register is clobbered. +static void handleClobber(LOHInfo &Info) { + Info.IsCandidate = false; + Info.OneUser = false; + Info.MultiUsers = false; + Info.LastADRP = nullptr; +} - // FIXME: When the statistics are not important, - // This initial filtering loop can be merged into the next loop. - // Currently, we didn't do it to have the same code for both DEBUG and - // NDEBUG builds. Indeed, the iterator of the second loop would need - // to be changed. - SetOfMachineInstr PotentialCandidates; - SetOfMachineInstr PotentialADROpportunities; - for (auto &Use : UseToDefs) { - // If no definition is available, this is a non candidate. - if (Use.second.empty()) - continue; - // Keep only instructions that are load or store and at the end of - // a ADRP -> ADD/LDR/Nothing chain. - // We already filtered out the no-chain cases. - if (!isCandidate(Use.first, UseToDefs, MDT)) { - PotentialADROpportunities.insert(Use.first); - continue; +/// Update state \p Info given that \p MI is possibly the middle instruction +/// of an LOH involving 3 instructions. +static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo, + LOHInfo &OpInfo) { + if (!DefInfo.IsCandidate || (&DefInfo != &OpInfo && OpInfo.OneUser)) + return false; + // Copy LOHInfo for dest register to LOHInfo for source register. + if (&DefInfo != &OpInfo) { + OpInfo = DefInfo; + // Invalidate \p DefInfo because we track it in \p OpInfo now. + handleClobber(DefInfo); + } else + DefInfo.LastADRP = nullptr; + + // Advance state machine. + assert(OpInfo.IsCandidate && "Expect valid state"); + if (MI.getOpcode() == AArch64::ADDXri && canAddBePartOfLOH(MI)) { + if (OpInfo.Type == MCLOH_AdrpLdr) { + OpInfo.Type = MCLOH_AdrpAddLdr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; + } else if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { + OpInfo.Type = MCLOH_AdrpAddStr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; } - PotentialCandidates.insert(Use.first); - } - - // Make the following distinctions for statistics as the linker does - // know how to decode instructions: - // - ADD/LDR/Nothing make there different patterns. - // - LDR/STR make two different patterns. - // Hence, 6 - 1 base patterns. - // (because ADRP-> Nothing -> STR is not simplifiable) - - // The linker is only able to have a simple semantic, i.e., if pattern A - // do B. - // However, we want to see the opportunity we may miss if we were able to - // catch more complex cases. - - // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> - // A potential candidate becomes a candidate, if its current immediate - // operand is zero and all nodes of the chain have respectively only one user -#ifndef NDEBUG - SetOfMachineInstr DefsOfPotentialCandidates; -#endif - for (const MachineInstr *Candidate : PotentialCandidates) { - // Get the definition of the candidate i.e., ADD or LDR. - const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); - // Record the elements of the chain. - const MachineInstr *L1 = Def; - const MachineInstr *L2 = nullptr; - unsigned ImmediateDefOpc = Def->getOpcode(); - if (Def->getOpcode() != AArch64::ADRP) { - // Check the number of users of this node. 
- const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifndef NDEBUG - // if all the uses of this def are in potential candidate, this is - // a complex candidate of level 2. - bool IsLevel2 = true; - for (const MachineInstr *MI : *Users) { - if (!PotentialCandidates.count(MI)) { - ++NumTooCplxLvl2; - IsLevel2 = false; - break; - } - } - if (IsLevel2) - ++NumCplxLvl2; -#endif // NDEBUG - PotentialADROpportunities.insert(Def); - continue; - } - L2 = Def; - Def = *UseToDefs.find(Def)->second.begin(); - L1 = Def; - } // else the element in the middle of the chain is nothing, thus - // Def already contains the first element of the chain. - - // Check the number of users of the first node in the chain, i.e., ADRP - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifndef NDEBUG - // if all the uses of this def are in the defs of the potential candidate, - // this is a complex candidate of level 1 - if (DefsOfPotentialCandidates.empty()) { - // lazy init - DefsOfPotentialCandidates = PotentialCandidates; - for (const MachineInstr *Candidate : PotentialCandidates) { - if (!UseToDefs.find(Candidate)->second.empty()) - DefsOfPotentialCandidates.insert( - *UseToDefs.find(Candidate)->second.begin()); - } - } - bool Found = false; - for (auto &Use : *Users) { - if (!DefsOfPotentialCandidates.count(Use)) { - ++NumTooCplxLvl1; - Found = true; - break; - } - } - if (!Found) - ++NumCplxLvl1; -#endif // NDEBUG - continue; + } else { + assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui"); + assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) && + "Expected GOT relocation"); + if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { + OpInfo.Type = MCLOH_AdrpLdrGotStr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; + } else if (OpInfo.Type == MCLOH_AdrpLdr) { + OpInfo.Type = MCLOH_AdrpLdrGotLdr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; } + } + return false; +} - bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); - // If the chain is three instructions long and ldr is the second element, - // then this ldr must load form GOT, otherwise this is not a correct chain. - if (L2 && !IsL2Add && - !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)) - continue; - SmallVector Args; - MCLOHType Kind; - if (isCandidateLoad(Candidate)) { - if (!L2) { - // At this point, the candidate LOH indicates that the ldr instruction - // may use a direct access to the symbol. There is not such encoding - // for loads of byte and half. - if (!supportLoadFromLiteral(Candidate)) - continue; +/// Update state when seeing and ADRP instruction. +static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, + LOHInfo &Info) { + if (Info.LastADRP != nullptr) { + DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n" << '\t' << MI << '\t' + << *Info.LastADRP); + AFI.addLOHDirective(MCLOH_AdrpAdrp, {&MI, Info.LastADRP}); + ++NumADRPSimpleCandidate; + } - DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate - << '\n'); - Kind = MCLOH_AdrpLdr; - Args.push_back(L1); - Args.push_back(Candidate); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); + // Produce LOH directive if possible. 
+ if (Info.IsCandidate) { + switch (Info.Type) { + case MCLOH_AdrpAdd: + DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0}); + ++NumADRSimpleCandidate; + break; + case MCLOH_AdrpLdr: + if (supportLoadFromLiteral(*Info.MI0)) { + DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdr, {&MI, Info.MI0}); ++NumADRPToLDR; - } else { - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifndef NDEBUG - // get the immediate of the load - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToLDR; - else - ++NumLDRToLDR; - else if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToLDRWithImm; - else - ++NumLDRToLDRWithImm; -#endif // NDEBUG } - } else { - if (ImmediateDefOpc == AArch64::ADRP) - continue; - else { - - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifndef NDEBUG - // get the immediate of the store - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToSTR; - else - ++NumLDRToSTR; - else if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToSTRWithImm; - else - ++NumLDRToSTRWithImm; -#endif // DEBUG + break; + case MCLOH_AdrpAddLdr: + DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0}); + ++NumADDToLDR; + break; + case MCLOH_AdrpAddStr: + if (Info.MI1 != nullptr) { + DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0}); + ++NumADDToSTR; } + break; + case MCLOH_AdrpLdrGotLdr: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGotLdr, {&MI, Info.MI1, Info.MI0}); + ++NumLDRToLDR; + break; + case MCLOH_AdrpLdrGotStr: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotStr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGotStr, {&MI, Info.MI1, Info.MI0}); + ++NumLDRToSTR; + break; + case MCLOH_AdrpLdrGot: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGot:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGot, {&MI, Info.MI0}); + break; + case MCLOH_AdrpAdrp: + 
llvm_unreachable("MCLOH_AdrpAdrp not used in state machine"); } - AArch64FI.addLOHDirective(Kind, Args); } - // Now, we grabbed all the big patterns, check ADR opportunities. - for (const MachineInstr *Candidate : PotentialADROpportunities) - registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI, - InvolvedInLOHs, RegToId); + handleClobber(Info); + Info.LastADRP = &MI; } -/// Look for every register defined by potential LOHs candidates. -/// Map these registers with dense id in @p RegToId and vice-versa in -/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. -static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, - MapIdToReg &IdToReg, - const TargetRegisterInfo *TRI) { - unsigned CurRegId = 0; - if (!PreCollectRegister) { - unsigned NbReg = TRI->getNumRegs(); - for (; CurRegId < NbReg; ++CurRegId) { - RegToId[CurRegId] = CurRegId; - DEBUG(IdToReg.push_back(CurRegId)); - DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); - } +static void handleRegMaskClobber(const uint32_t *RegMask, MCPhysReg Reg, + LOHInfo *LOHInfos) { + if (!MachineOperand::clobbersPhysReg(RegMask, Reg)) return; - } - - DEBUG(dbgs() << "** Collect Involved Register\n"); - for (const auto &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI) && - !isCandidateLoad(&MI) && !isCandidateStore(&MI)) - continue; + int Idx = mapRegToGPRIndex(Reg); + if (Idx >= 0) + handleClobber(LOHInfos[Idx]); +} - // Process defs - for (MachineInstr::const_mop_iterator IO = MI.operands_begin(), - IOEnd = MI.operands_end(); - IO != IOEnd; ++IO) { - if (!IO->isReg() || !IO->isDef()) - continue; - unsigned CurReg = IO->getReg(); - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) - if (RegToId.find(*AI) == RegToId.end()) { - DEBUG(IdToReg.push_back(*AI); - assert(IdToReg[CurRegId] == *AI && - "Reg index mismatches insertion index.")); - RegToId[*AI] = CurRegId++; - DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); - } - } +static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) { + // Handle defs and regmasks. + for (const MachineOperand &MO : MI.operands()) { + if (MO.isRegMask()) { + const uint32_t *RegMask = MO.getRegMask(); + for (MCPhysReg Reg : AArch64::GPR32RegClass) + handleRegMaskClobber(RegMask, Reg, LOHInfos); + for (MCPhysReg Reg : AArch64::GPR64RegClass) + handleRegMaskClobber(RegMask, Reg, LOHInfos); + continue; } + if (!MO.isReg() || !MO.isDef()) + continue; + int Idx = mapRegToGPRIndex(MO.getReg()); + if (Idx < 0) + continue; + handleClobber(LOHInfos[Idx]); + } + // Handle uses. 
+ for (const MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + int Idx = mapRegToGPRIndex(MO.getReg()); + if (Idx < 0) + continue; + handleUse(MI, MO, LOHInfos[Idx]); } } @@ -1035,74 +489,59 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const MachineDominatorTree *MDT = &getAnalysis(); - - MapRegToId RegToId; - MapIdToReg IdToReg; - AArch64FunctionInfo *AArch64FI = MF.getInfo(); - assert(AArch64FI && "No MachineFunctionInfo for this function!"); - - DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); + DEBUG(dbgs() << "********** AArch64 Collect LOH **********\n" + << "Looking in function " << MF.getName() << '\n'); - collectInvolvedReg(MF, RegToId, IdToReg, TRI); - if (RegToId.empty()) - return false; + LOHInfo LOHInfos[N_GPR_REGS]; + AArch64FunctionInfo &AFI = *MF.getInfo(); + for (const MachineBasicBlock &MBB : MF) { + // Reset register tracking state. + memset(LOHInfos, 0, sizeof(LOHInfos)); + // Live-out registers are used. + for (const MachineBasicBlock *Succ : MBB.successors()) { + for (const auto &LI : Succ->liveins()) { + int RegIdx = mapRegToGPRIndex(LI.PhysReg); + if (RegIdx >= 0) + LOHInfos[RegIdx].OneUser = true; + } + } - MachineInstr *DummyOp = nullptr; - if (BasicBlockScopeOnly) { - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - // For local analysis, create a dummy operation to record uses that are not - // local. - DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); + // Walk the basic block backwards and update the per register state machine + // in the process. + for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AArch64::ADDXri: + case AArch64::LDRXui: + if (canDefBePartOfLOH(MI)) { + const MachineOperand &Def = MI.getOperand(0); + const MachineOperand &Op = MI.getOperand(1); + assert(Def.isReg() && Def.isDef() && "Expected reg def"); + assert(Op.isReg() && Op.isUse() && "Expected reg use"); + int DefIdx = mapRegToGPRIndex(Def.getReg()); + int OpIdx = mapRegToGPRIndex(Op.getReg()); + if (DefIdx >= 0 && OpIdx >= 0 && + handleMiddleInst(MI, LOHInfos[DefIdx], LOHInfos[OpIdx])) + continue; + } + break; + case AArch64::ADRP: + const MachineOperand &Op0 = MI.getOperand(0); + int Idx = mapRegToGPRIndex(Op0.getReg()); + if (Idx >= 0) { + handleADRP(MI, AFI, LOHInfos[Idx]); + continue; + } + break; + } + handleNormalInst(MI, LOHInfos); + } } - unsigned NbReg = RegToId.size(); - bool Modified = false; - - // Start with ADRP. - InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // Compute the reaching def in ADRP mode, meaning ADRP definitions - // are first considered as uses. - reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp); - DEBUG(dbgs() << "ADRP reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Translate the definition to uses map into a use to definitions map to ease - // statistic computation. - InstrToInstrs ADRPToReachingDefs; - reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); - - // Compute LOH for ADRP. - computeADRP(ADRPToReachingDefs, *AArch64FI, MDT); - delete[] ColorOpToReachedUses; - - // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. - ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // first perform a regular reaching def analysis. 
- reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp); - DEBUG(dbgs() << "All reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Turn that into a use to defs to ease statistic computation. - InstrToInstrs UsesToReachingDefs; - reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); - - // Compute other than AdrpAdrp LOH. - computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId, - MDT); - delete[] ColorOpToReachedUses; - - if (BasicBlockScopeOnly) - MF.DeleteMachineInstr(DummyOp); - - return Modified; + // Return "no change": The pass only collects information. + return false; } -/// createAArch64CollectLOHPass - returns an instance of the Statistic for -/// linker optimization pass. FunctionPass *llvm::createAArch64CollectLOHPass() { return new AArch64CollectLOH(); } diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4c98253878e..74a01835171 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11,28 +11,79 @@ // //===----------------------------------------------------------------------===// -#include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64ISelLowering.h" #include "AArch64PerfectShuffle.h" +#include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" -#include "AArch64TargetObjectFile.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + using namespace llvm; #define DEBUG_TYPE "aarch64-lower" @@ -59,7 +110,6 @@ static const MVT MVT_CC = MVT::i32; AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. setBooleanContents(ZeroOrOneBooleanContent); @@ -218,7 +268,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); @@ -3632,6 +3681,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, llvm_unreachable("Unexpected platform trying to use TLS"); } + SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); @@ -4549,7 +4599,6 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } - /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, @@ -5074,10 +5123,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int WindowBase; int WindowScale; - bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } ShuffleSourceInfo(SDValue Vec) - : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), - WindowScale(1) {} + : Vec(Vec), MinElt(std::numeric_limits::max()), MaxElt(0), + ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } }; // First gather all vectors used as an immediate source for this BUILD_VECTOR @@ -7028,7 +7078,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } case Intrinsic::aarch64_ldaxp: - case Intrinsic::aarch64_ldxp: { + case Intrinsic::aarch64_ldxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); @@ -7038,9 +7088,8 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = true; Info.writeMem = false; return true; - } case Intrinsic::aarch64_stlxp: - case Intrinsic::aarch64_stxp: { + case Intrinsic::aarch64_stxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); @@ -7050,7 +7099,6 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = false; Info.writeMem = true; return true; - } default: break; } @@ -8044,13 +8092,13 @@ static SDValue tryCombineToEXTR(SDNode *N, SDValue LHS; uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; + bool LHSFromHi = false; if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) return SDValue(); SDValue RHS; uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; + bool RHSFromHi = false; if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) return SDValue(); @@ -9732,52 +9780,51 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width, switch(CC) { case AArch64CC::LE: - case AArch64CC::GT: { + case AArch64CC::GT: if ((AddConstant == 0) || (CompConstant == MaxUInt - 1 && AddConstant < 0) || (AddConstant >= 0 && CompConstant < 0) || 
(AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) return true; - } break; + break; case AArch64CC::LT: - case AArch64CC::GE: { + case AArch64CC::GE: if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) return true; - } break; + break; case AArch64CC::HI: - case AArch64CC::LS: { + case AArch64CC::LS: if ((AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant >= -1 && CompConstant < AddConstant + MaxUInt)) return true; - } break; + break; case AArch64CC::PL: - case AArch64CC::MI: { + case AArch64CC::MI: if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) || (AddConstant < 0 && CompConstant <= AddConstant)) return true; - } break; + break; case AArch64CC::LO: - case AArch64CC::HS: { + case AArch64CC::HS: if ((AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant >= 0 && CompConstant <= AddConstant + MaxUInt)) return true; - } break; + break; case AArch64CC::EQ: - case AArch64CC::NE: { + case AArch64CC::NE: if ((AddConstant > 0 && CompConstant < 0) || (AddConstant < 0 && CompConstant >= 0 && CompConstant < AddConstant + MaxUInt) || (AddConstant >= 0 && CompConstant >= 0 && CompConstant >= AddConstant) || (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) - return true; - } break; + break; case AArch64CC::VS: case AArch64CC::VC: case AArch64CC::AL: @@ -10501,7 +10548,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, if (ValTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); + Function *Ldxr = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); @@ -10517,7 +10564,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldxr, Addr), @@ -10527,8 +10574,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall( - llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 626c934f236..5c8acba26aa 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -14,16 +14,37 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" -#include +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include +#include +#include +#include using namespace llvm; @@ -529,19 +550,19 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, default: llvm_unreachable("Unknown branch opcode in Cond"); case AArch64::CBZW: - Is64Bit = 0; + Is64Bit = false; CC = AArch64CC::EQ; break; case AArch64::CBZX: - Is64Bit = 1; + Is64Bit = true; CC = AArch64CC::EQ; break; case AArch64::CBNZW: - Is64Bit = 0; + Is64Bit = false; CC = AArch64CC::NE; break; case AArch64::CBNZX: - Is64Bit = 1; + Is64Bit = true; CC = AArch64CC::NE; break; } @@ -1044,7 +1065,7 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: - return Instr.getOpcode();; + return Instr.getOpcode(); case AArch64::ADDWrr: return AArch64::ADDSWrr; case AArch64::ADDWri: return AArch64::ADDSWri; @@ -1072,12 +1093,15 @@ static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { } namespace { + struct UsedNZCV { - bool N; - bool Z; - bool C; - bool V; - UsedNZCV(): N(false), Z(false), C(false), V(false) {} + bool N = false; + bool Z = false; + bool C = false; + bool V = false; + + UsedNZCV() = default; + UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { this->N |= UsedFlags.N; this->Z |= UsedFlags.Z; @@ -1086,6 +1110,7 @@ struct UsedNZCV { return *this; } }; + } // end anonymous 
namespace /// Find a condition code used by the instruction. @@ -1561,7 +1586,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const { /// Check all MachineMemOperands for a hint to suppress pairing. bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const { - return any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { return MMO->getFlags() & MOSuppressPair; }); } @@ -1994,7 +2019,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, void AArch64InstrInfo::copyPhysRegTuple( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, - llvm::ArrayRef Indices) const { + ArrayRef Indices) const { assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -2583,7 +2608,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // // - if (MI.isCopy()) { + if (MI.isFullCopy()) { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && @@ -2598,7 +2623,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( } } - // Handle the case where a copy is being spilled or refilled but the source + // Handle the case where a copy is being spilled or filled but the source // and destination register class don't match. For example: // // %vreg0 = COPY %XZR; GPR64common:%vreg0 @@ -2613,7 +2638,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // %vreg0 = COPY %vreg1; GPR64:%vreg0, FPR64:%vreg1 // - // will be refilled as + // will be filled as // // LDRDui %vreg0, fi<#0> // @@ -2622,9 +2647,11 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // LDRXui %vregTemp, fi<#0> // %vreg0 = FMOV %vregTemp // - if (MI.isFullCopy() && Ops.size() == 1 && + if (MI.isCopy() && Ops.size() == 1 && // Make sure we're only folding the explicit COPY defs/uses. (Ops[0] == 0 || Ops[0] == 1)) { + bool IsSpill = Ops[0] == 0; + bool IsFill = !IsSpill; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock &MBB = *MI.getParent(); @@ -2632,21 +2659,112 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( const MachineOperand &SrcMO = MI.getOperand(1); unsigned DstReg = DstMO.getReg(); unsigned SrcReg = SrcMO.getReg(); + // This is slightly expensive to compute for physical regs since + // getMinimalPhysRegClass is slow. auto getRegClass = [&](unsigned Reg) { return TargetRegisterInfo::isVirtualRegister(Reg) ? 
MRI.getRegClass(Reg) : TRI.getMinimalPhysRegClass(Reg); }; - const TargetRegisterClass &DstRC = *getRegClass(DstReg); - const TargetRegisterClass &SrcRC = *getRegClass(SrcReg); - if (DstRC.getSize() == SrcRC.getSize()) { - if (Ops[0] == 0) + + if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { + assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() && + "Mismatched register size in non subreg COPY"); + if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, - &SrcRC, &TRI); + getRegClass(SrcReg), &TRI); else - loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, &DstRC, &TRI); + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, + getRegClass(DstReg), &TRI); return &*--InsertPt; } + + // Handle cases like spilling def of: + // + // %vreg0:sub_32 = COPY %WZR; GPR64common:%vreg0 + // + // where the physical register source can be widened and stored to the full + // virtual reg destination stack slot, in this case producing: + // + // STRXui %XZR, + // + if (IsSpill && DstMO.isUndef() && + TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + assert(SrcMO.getSubReg() == 0 && + "Unexpected subreg on physical register"); + const TargetRegisterClass *SpillRC; + unsigned SpillSubreg; + switch (DstMO.getSubReg()) { + default: + SpillRC = nullptr; + break; + case AArch64::sub_32: + case AArch64::ssub: + if (AArch64::GPR32RegClass.contains(SrcReg)) { + SpillRC = &AArch64::GPR64RegClass; + SpillSubreg = AArch64::sub_32; + } else if (AArch64::FPR32RegClass.contains(SrcReg)) { + SpillRC = &AArch64::FPR64RegClass; + SpillSubreg = AArch64::ssub; + } else + SpillRC = nullptr; + break; + case AArch64::dsub: + if (AArch64::FPR64RegClass.contains(SrcReg)) { + SpillRC = &AArch64::FPR128RegClass; + SpillSubreg = AArch64::dsub; + } else + SpillRC = nullptr; + break; + } + + if (SpillRC) + if (unsigned WidenedSrcReg = + TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { + storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), + FrameIndex, SpillRC, &TRI); + return &*--InsertPt; + } + } + + // Handle cases like filling use of: + // + // %vreg0:sub_32 = COPY %vreg1; GPR64:%vreg0, GPR32:%vreg1 + // + // where we can load the full virtual reg source stack slot, into the subreg + // destination, in this case producing: + // + // LDRWui %vreg0:sub_32, + // + if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { + const TargetRegisterClass *FillRC; + switch (DstMO.getSubReg()) { + default: + FillRC = nullptr; + break; + case AArch64::sub_32: + FillRC = &AArch64::GPR32RegClass; + break; + case AArch64::ssub: + FillRC = &AArch64::FPR32RegClass; + break; + case AArch64::dsub: + FillRC = &AArch64::FPR64RegClass; + break; + } + + if (FillRC) { + assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() && + "Mismatched regclass size on folded subreg COPY"); + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); + MachineInstr &LoadMI = *--InsertPt; + MachineOperand &LoadDst = LoadMI.getOperand(0); + assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); + LoadDst.setSubReg(DstMO.getSubReg()); + LoadDst.setIsUndef(); + return &LoadMI; + } + } } // Cannot fold. 
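Illustrative sketch (not part of the patch): the folding added above picks a widened spill register class from the COPY destination's subreg index and the class of the physical source register, and only emits the widened store when TRI.getMatchingSuperReg actually finds a super-register. A rough standalone approximation of that selection step, using simplified stand-in enums rather than the real AArch64 register classes, looks like this:

#include <cstdio>

// Simplified stand-ins for AArch64 subreg indices and register classes.
enum SubRegIdx { NoSubReg, Sub32, SSub, DSub };
enum RegClass { NoClass, GPR32, GPR64, FPR32, FPR64, FPR128 };

// Pick the widened register class used to spill a physical source register
// through a subreg-indexed COPY; returns NoClass when no widening applies.
static RegClass widenedSpillClass(SubRegIdx DstSubIdx, RegClass SrcRC) {
  switch (DstSubIdx) {
  case Sub32:
  case SSub:
    if (SrcRC == GPR32) return GPR64; // in the patch: %WZR is stored via its 64-bit super-reg (STRXui %XZR)
    if (SrcRC == FPR32) return FPR64;
    return NoClass;
  case DSub:
    return SrcRC == FPR64 ? FPR128 : NoClass;
  default:
    return NoClass;
  }
}

int main() {
  // %vreg0:sub_32 = COPY %WZR  ->  spill the 64-bit super-register instead.
  std::printf("%s\n", widenedSpillClass(Sub32, GPR32) == GPR64 ? "GPR64" : "none");
  return 0;
}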
@@ -2936,7 +3054,7 @@ bool AArch64InstrInfo::useMachineCombiner() const { return true; } -// + // True when Opc sets flag static bool isCombineInstrSettingFlag(unsigned Opc) { switch (Opc) { @@ -2955,7 +3073,7 @@ static bool isCombineInstrSettingFlag(unsigned Opc) { } return false; } -// + // 32b Opcodes that can be combined with a MUL static bool isCombineInstrCandidate32(unsigned Opc) { switch (Opc) { @@ -2974,7 +3092,7 @@ static bool isCombineInstrCandidate32(unsigned Opc) { } return false; } -// + // 64b Opcodes that can be combined with a MUL static bool isCombineInstrCandidate64(unsigned Opc) { switch (Opc) { @@ -2993,7 +3111,7 @@ static bool isCombineInstrCandidate64(unsigned Opc) { } return false; } -// + // FP Opcodes that can be combined with a FMUL static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { switch (Inst.getOpcode()) { @@ -3009,13 +3127,13 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { case AArch64::FSUBv2f32: case AArch64::FSUBv2f64: case AArch64::FSUBv4f32: - TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; - return (Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast); + TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; + return (Options.UnsafeFPMath || + Options.AllowFPOpFusion == FPOpFusion::Fast); } return false; } -// + // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); @@ -3205,7 +3323,7 @@ static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl &Patterns) { if (!isCombineInstrCandidateFP(Root)) - return 0; + return false; MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -3971,8 +4089,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); DelInstrs.push_back(&Root); - - return; } /// \brief Replace csincr-branch sequence by simple conditional branch @@ -4148,6 +4264,7 @@ AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { ArrayRef> AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { using namespace AArch64II; + static const std::pair TargetFlags[] = { {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, @@ -4162,6 +4279,7 @@ AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { ArrayRef> AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { using namespace AArch64II; + static const std::pair TargetFlags[] = { {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 90b2c089687..5037866925d 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -162,6 +162,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + // This tells target independent code that it is okay to pass instructions + // with subreg operands to foldMemoryOperandImpl. 
+ bool isSubregFoldable() const override { return true; } + using TargetInstrInfo::foldMemoryOperandImpl; MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 20de07424c5..b51473524c7 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -1071,8 +1071,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return false; } - const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( - (CmpInst::Predicate)I.getOperand(1).getPredicate()); + // CSINC increments the result by one when the condition code is false. + // Therefore, we have to invert the predicate to get an increment by 1 when + // the predicate is true. + const AArch64CC::CondCode invCC = + changeICMPPredToAArch64CC(CmpInst::getInversePredicate( + (CmpInst::Predicate)I.getOperand(1).getPredicate())); MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) .addDef(ZReg) @@ -1084,7 +1088,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { .addDef(I.getOperand(0).getReg()) .addUse(AArch64::WZR) .addUse(AArch64::WZR) - .addImm(CC); + .addImm(invCC); constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); diff --git a/lib/Target/AArch64/AArch64InstructionSelector.h b/lib/Target/AArch64/AArch64InstructionSelector.h index 0d44e696ac2..2c6e5a912fb 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.h +++ b/lib/Target/AArch64/AArch64InstructionSelector.h @@ -17,6 +17,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" namespace llvm { + class AArch64InstrInfo; class AArch64RegisterBankInfo; class AArch64RegisterInfo; @@ -29,7 +30,7 @@ class AArch64InstructionSelector : public InstructionSelector { const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); - virtual bool select(MachineInstr &I) const override; + bool select(MachineInstr &I) const override; private: /// tblgen-erated 'select' implementation, used as the initial selector for @@ -43,5 +44,6 @@ class AArch64InstructionSelector : public InstructionSelector { const AArch64RegisterBankInfo &RBI; }; -} // End llvm namespace. -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index ca2860afe13..f0bffe54415 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -14,17 +14,18 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/MC/MCLinkerOptimizationHint.h" +#include namespace llvm { /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { - /// Number of bytes of arguments this function has on the stack. If the callee /// is expected to restore the argument stack this should be a multiple of 16, /// all usable during a tail call. 
@@ -34,16 +35,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// space to a function with 16-bytes then misalignment of this value would /// make a stack adjustment necessary, which could not be undone by the /// callee. - unsigned BytesInStackArgArea; + unsigned BytesInStackArgArea = 0; /// The number of bytes to restore to deallocate space for incoming /// arguments. Canonically 0 in the C calling convention, but non-zero when /// callee is expected to pop the args. - unsigned ArgumentStackToRestore; + unsigned ArgumentStackToRestore = 0; /// HasStackFrame - True if this function has a stack frame. Set by /// determineCalleeSaves(). - bool HasStackFrame; + bool HasStackFrame = false; /// \brief Amount of stack frame size, not including callee-saved registers. unsigned LocalStackSize; @@ -53,54 +54,44 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// \brief Number of TLS accesses using the special (combinable) /// _TLS_MODULE_BASE_ symbol. - unsigned NumLocalDynamicTLSAccesses; + unsigned NumLocalDynamicTLSAccesses = 0; /// \brief FrameIndex for start of varargs area for arguments passed on the /// stack. - int VarArgsStackIndex; + int VarArgsStackIndex = 0; /// \brief FrameIndex for start of varargs area for arguments passed in /// general purpose registers. - int VarArgsGPRIndex; + int VarArgsGPRIndex = 0; /// \brief Size of the varargs area for arguments passed in general purpose /// registers. - unsigned VarArgsGPRSize; + unsigned VarArgsGPRSize = 0; /// \brief FrameIndex for start of varargs area for arguments passed in /// floating-point registers. - int VarArgsFPRIndex; + int VarArgsFPRIndex = 0; /// \brief Size of the varargs area for arguments passed in floating-point /// registers. - unsigned VarArgsFPRSize; + unsigned VarArgsFPRSize = 0; /// True if this function has a subset of CSRs that is handled explicitly via /// copies. - bool IsSplitCSR; + bool IsSplitCSR = false; /// True when the stack gets realigned dynamically because the size of stack /// frame is unknown at compile time. e.g., in case of VLAs. - bool StackRealigned; + bool StackRealigned = false; /// True when the callee-save stack area has unused gaps that may be used for /// other stack allocations. 
- bool CalleeSaveStackHasFreeSpace; + bool CalleeSaveStackHasFreeSpace = false; public: - AArch64FunctionInfo() - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false), StackRealigned(false), - CalleeSaveStackHasFreeSpace(false) {} - - explicit AArch64FunctionInfo(MachineFunction &MF) - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false), StackRealigned(false), - CalleeSaveStackHasFreeSpace(false) { + AArch64FunctionInfo() = default; + + explicit AArch64FunctionInfo(MachineFunction &MF) { (void)MF; } @@ -193,6 +184,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { MILOHContainer LOHContainerSet; SetOfInstructions LOHRelated; }; -} // End llvm namespace -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index f58bbbd2613..03e01329e03 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -71,6 +71,7 @@ void AArch64Subtarget::initializeProperties() { break; case Falkor: MaxInterleaveFactor = 4; + VectorInsertExtractBaseCost = 2; break; case Kryo: MaxInterleaveFactor = 4; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index e4ef0d4bb8d..d2883941e2c 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -15,24 +15,35 @@ #include "AArch64InstructionSelector.h" #include "AArch64LegalizerInfo.h" #include "AArch64RegisterBankInfo.h" +#include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/InitializePasses.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" +#include +#include + using namespace llvm; static cl::opt EnableCCMP("aarch64-enable-ccmp", @@ -154,9 +165,9 @@ extern "C" void LLVMInitializeAArch64Target() { //===----------------------------------------------------------------------===// static std::unique_ptr createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return make_unique(); + return llvm::make_unique(); - return make_unique(); + return llvm::make_unique(); } // Helper function to build a DataLayout 
string @@ -202,29 +213,35 @@ AArch64TargetMachine::AArch64TargetMachine( initAsmInfo(); } -AArch64TargetMachine::~AArch64TargetMachine() {} +AArch64TargetMachine::~AArch64TargetMachine() = default; #ifdef LLVM_BUILD_GLOBAL_ISEL namespace { + struct AArch64GISelActualAccessor : public GISelAccessor { std::unique_ptr CallLoweringInfo; std::unique_ptr InstSelector; std::unique_ptr Legalizer; std::unique_ptr RegBankInfo; + const CallLowering *getCallLowering() const override { return CallLoweringInfo.get(); } + const InstructionSelector *getInstructionSelector() const override { return InstSelector.get(); } + const LegalizerInfo *getLegalizerInfo() const override { return Legalizer.get(); } + const RegisterBankInfo *getRegBankInfo() const override { return RegBankInfo.get(); } }; -} // End anonymous namespace. + +} // end anonymous namespace #endif const AArch64Subtarget * @@ -287,6 +304,7 @@ AArch64beTargetMachine::AArch64beTargetMachine( : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} namespace { + /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: @@ -324,7 +342,8 @@ class AArch64PassConfig : public TargetPassConfig { void addPreSched2() override; void addPreEmitPass() override; }; -} // namespace + +} // end anonymous namespace TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { return TargetIRAnalysis([this](const Function &F) { @@ -414,14 +433,17 @@ bool AArch64PassConfig::addIRTranslator() { addPass(new IRTranslator()); return false; } + bool AArch64PassConfig::addLegalizeMachineIR() { addPass(new Legalizer()); return false; } + bool AArch64PassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; } + bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 88c98865bbc..1a17691fc58 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -417,14 +417,17 @@ int AArch64TTIImpl::getArithmeticInstrCost( } } -int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. 
unsigned NumVectorInstToHideOverhead = 10; + int MaxMergeDistance = 64; - if (Ty->isVectorTy() && IsComplex) + if (Ty->isVectorTy() && SE && + !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; // In many cases the address computation is not merged into the instruction diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 24642cb1698..849fd3d9b44 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -104,7 +104,7 @@ class AArch64TTIImpl : public BasicTTIImplBase { TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - int getAddressComputationCost(Type *Ty, bool IsComplex); + int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index db84afacf30..b86a283b40d 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -9,45 +9,62 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "MCTargetDesc/AArch64TargetStreamer.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Debug.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include +#include +#include #include +#include +#include +#include +#include + using namespace llvm; namespace { -class AArch64Operand; - class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. // Map of register aliases registers via the .req directive. 
- StringMap > RegisterReqs; + StringMap> RegisterReqs; AArch64TargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); @@ -118,6 +135,7 @@ class AArch64AsmParser : public MCTargetAsmParser { #include "AArch64GenAsmMatcher.inc" }; bool IsILP32; + AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, STI) { @@ -143,9 +161,6 @@ class AArch64AsmParser : public MCTargetAsmParser { MCSymbolRefExpr::VariantKind &DarwinRefKind, int64_t &Addend); }; -} // end anonymous namespace - -namespace { /// AArch64Operand - Instances of this class represent a parsed AArch64 machine /// instruction. @@ -531,6 +546,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 2); } + bool isImm0_7() const { if (!isImm()) return false; @@ -540,6 +556,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 8); } + bool isImm1_8() const { if (!isImm()) return false; @@ -549,6 +566,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val > 0 && Val < 9); } + bool isImm0_15() const { if (!isImm()) return false; @@ -558,6 +576,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 16); } + bool isImm1_16() const { if (!isImm()) return false; @@ -567,6 +586,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val > 0 && Val < 17); } + bool isImm0_31() const { if (!isImm()) return false; @@ -576,6 +596,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 32); } + bool isImm1_31() const { if (!isImm()) return false; @@ -585,6 +606,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 32); } + bool isImm1_32() const { if (!isImm()) return false; @@ -594,6 +616,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 33); } + bool isImm0_63() const { if (!isImm()) return false; @@ -603,6 +626,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 64); } + bool isImm1_63() const { if (!isImm()) return false; @@ -612,6 +636,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 64); } + bool isImm1_64() const { if (!isImm()) return false; @@ -621,6 +646,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 65); } + bool isImm0_127() const { if (!isImm()) return false; @@ -630,6 +656,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 128); } + bool isImm0_255() const { if (!isImm()) return false; @@ -639,6 +666,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 256); } + bool isImm0_65535() const { if (!isImm()) return false; @@ -648,6 +676,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 65536); } + bool isImm32_63() const { if (!isImm()) return false; @@ -657,6 +686,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 32 && Val < 64); } + bool 
isLogicalImm32() const { if (!isImm()) return false; @@ -669,6 +699,7 @@ class AArch64Operand : public MCParsedAsmOperand { Val &= 0xFFFFFFFF; return AArch64_AM::isLogicalImmediate(Val, 32); } + bool isLogicalImm64() const { if (!isImm()) return false; @@ -677,6 +708,7 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); } + bool isLogicalImm32Not() const { if (!isImm()) return false; @@ -686,6 +718,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = ~MCE->getValue() & 0xFFFFFFFF; return AArch64_AM::isLogicalImmediate(Val, 32); } + bool isLogicalImm64Not() const { if (!isImm()) return false; @@ -694,7 +727,9 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64); } + bool isShiftedImm() const { return Kind == k_ShiftedImm; } + bool isAddSubImm() const { if (!isShiftedImm() && !isImm()) return false; @@ -737,6 +772,7 @@ class AArch64Operand : public MCParsedAsmOperand { // code deal with it. return true; } + bool isAddSubImmNeg() const { if (!isShiftedImm() && !isImm()) return false; @@ -756,7 +792,9 @@ class AArch64Operand : public MCParsedAsmOperand { const MCConstantExpr *CE = dyn_cast(Expr); return CE != nullptr && CE->getValue() < 0 && -CE->getValue() <= 0xfff; } + bool isCondCode() const { return Kind == k_CondCode; } + bool isSIMDImmType10() const { if (!isImm()) return false; @@ -765,6 +803,7 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); } + bool isBranchTarget26() const { if (!isImm()) return false; @@ -776,6 +815,7 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); } + bool isPCRelLabel19() const { if (!isImm()) return false; @@ -787,6 +827,7 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); } + bool isBranchTarget14() const { if (!isImm()) return false; @@ -891,40 +932,49 @@ class AArch64Operand : public MCParsedAsmOperand { bool isFPImm() const { return Kind == k_FPImm; } bool isBarrier() const { return Kind == k_Barrier; } bool isSysReg() const { return Kind == k_SysReg; } + bool isMRSSystemRegister() const { if (!isSysReg()) return false; return SysReg.MRSReg != -1U; } + bool isMSRSystemRegister() const { if (!isSysReg()) return false; return SysReg.MSRReg != -1U; } + bool isSystemPStateFieldWithImm0_1() const { if (!isSysReg()) return false; return (SysReg.PStateField == AArch64PState::PAN || SysReg.PStateField == AArch64PState::UAO); } + bool isSystemPStateFieldWithImm0_15() const { if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false; return SysReg.PStateField != -1U; } + bool isReg() const override { return Kind == k_Register && !Reg.isVector; } bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isVectorRegLo() const { return Kind == k_Register && Reg.isVector && AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( Reg.RegNum); } + bool isGPR32as64() const { return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); } + bool isWSeqPair() const { return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( Reg.RegNum); } + bool isXSeqPair() const { return Kind == k_Register && !Reg.isVector && 
AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains( @@ -957,19 +1007,25 @@ class AArch64Operand : public MCParsedAsmOperand { bool isVectorIndex1() const { return Kind == k_VectorIndex && VectorIndex.Val == 1; } + bool isVectorIndexB() const { return Kind == k_VectorIndex && VectorIndex.Val < 16; } + bool isVectorIndexH() const { return Kind == k_VectorIndex && VectorIndex.Val < 8; } + bool isVectorIndexS() const { return Kind == k_VectorIndex && VectorIndex.Val < 4; } + bool isVectorIndexD() const { return Kind == k_VectorIndex && VectorIndex.Val < 2; } + bool isToken() const override { return Kind == k_Token; } + bool isTokenEqual(StringRef Str) const { return Kind == k_Token && getToken() == Str; } @@ -1006,6 +1062,7 @@ class AArch64Operand : public MCParsedAsmOperand { AArch64_AM::ShiftExtendType ET = getShiftExtendType(); return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX; } + bool isExtendLSL64() const { if (!isExtend()) return false; @@ -1836,11 +1893,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << ""; break; } - case k_PSBHint: { + case k_PSBHint: OS << getPSBHintName(); break; - } - case k_ShiftExtend: { + case k_ShiftExtend: OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); if (!hasShiftExtendAmount()) @@ -1848,7 +1904,6 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << '>'; break; } - } } /// @name Auto-generated Match Functions @@ -2469,7 +2524,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, Expr = MCConstantExpr::create(op2, getContext()); \ Operands.push_back( \ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - } while (0) + } while (false) if (Mnemonic == "ic") { if (!Op.compare_lower("ialluis")) { @@ -3979,7 +4034,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } } - switch (MatchResult) { case Match_Success: { // Perform range checking and other semantic validations @@ -4550,7 +4604,6 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, return Match_InvalidOperand; } - OperandMatchResultTy AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { @@ -4601,7 +4654,7 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { return MatchOperand_ParseFail; } - if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 || + if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 || (isXReg && !XRegClass.contains(SecondReg)) || (isWReg && !WRegClass.contains(SecondReg))) { Error(E,"expected second odd register of a " @@ -4610,7 +4663,7 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { } unsigned Pair = 0; - if(isXReg) { + if (isXReg) { Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube64, &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]); } else { diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 24e353cf4b9..bc2f7f18169 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -17,15 +17,12 @@ namespace llvm { -class MCInst; -class raw_ostream; - class AArch64Disassembler : public MCDisassembler { public: AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - ~AArch64Disassembler() {} + ~AArch64Disassembler() override = default; MCDisassembler::DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, @@ -33,6 +30,6 @@ 
class AArch64Disassembler : public MCDisassembler { raw_ostream &CStream) const override; }; -} // namespace llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index a1edb3cef46..c954c0eb2c6 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -17,25 +17,30 @@ #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include +#include using namespace llvm; namespace { + class AArch64ELFObjectWriter : public MCELFObjectTargetWriter { public: AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, bool IsILP32); - ~AArch64ELFObjectWriter() override; + ~AArch64ELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; bool IsILP32; -private: }; -} + +} // end anonymous namespace AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, @@ -44,8 +49,6 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, /*HasRelocationAddend*/ true), IsILP32(IsILP32) {} -AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} - #define R_CLS(rtype) \ IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype #define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\ diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index f7058cdf237..62dfa59483e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -15,15 +15,23 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include +#include + using namespace llvm; #define DEBUG_TYPE "mccodeemitter" @@ -37,13 +45,12 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { MCContext &Ctx; const MCInstrInfo &MCII; - AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT public: AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : Ctx(ctx), MCII(mcii) {} - - ~AArch64MCCodeEmitter() override {} + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) = delete; + void operator=(const AArch64MCCodeEmitter &) = delete; + ~AArch64MCCodeEmitter() override = default; // getBinaryCodeForInstr - TableGen'erated function for getting the // binary encoding for an instruction. 
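Illustrative sketch (not part of the patch): several hunks in this import, including the AArch64MCCodeEmitter change just above and the Lanai emitters further down, replace the old "declare but DO NOT IMPLEMENT" copy members with C++11 '= delete' and defaulted destructors. A minimal standalone example of the idiom change, with made-up class names:

// Pre-C++11 style: copying is suppressed by declaring, but never defining,
// the copy operations in a private section ("DO NOT IMPLEMENT").
class OldEmitter {
  OldEmitter(const OldEmitter &);      // DO NOT IMPLEMENT
  void operator=(const OldEmitter &);  // DO NOT IMPLEMENT
public:
  OldEmitter() {}
  ~OldEmitter() {}
};

// C++11 style used after this patch: deleted copy operations state the intent
// directly and turn misuse into a clear compile-time error, while '= default'
// documents that no custom teardown is needed.
class NewEmitter {
public:
  NewEmitter() = default;
  NewEmitter(const NewEmitter &) = delete;
  void operator=(const NewEmitter &) = delete;
  ~NewEmitter() = default;
};

int main() {
  NewEmitter E;
  // NewEmitter F = E;  // would not compile: the copy constructor is deleted
  (void)E;
  return 0;
}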
@@ -181,12 +188,6 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { } // end anonymous namespace -MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new AArch64MCCodeEmitter(MCII, Ctx); -} - /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. unsigned @@ -601,3 +602,9 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( #define ENABLE_INSTR_PREDICATE_VERIFIER #include "AArch64GenMCCodeEmitter.inc" + +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(MCII, Ctx); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 3e86a42d5be..1b949b54590 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -13,6 +13,7 @@ #include "AArch64TargetStreamer.h" #include "llvm/MC/ConstantPools.h" + using namespace llvm; // @@ -21,7 +22,7 @@ using namespace llvm; AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S) : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {} -AArch64TargetStreamer::~AArch64TargetStreamer() {} +AArch64TargetStreamer::~AArch64TargetStreamer() = default; // The constant pool handling is shared by all AArch64TargetStreamer // implementations. diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index a8e6902c252..4acd55eb612 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -176,12 +176,14 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SetupMachineFunction(MF); + const AMDGPUSubtarget &STM = MF.getSubtarget(); MCContext &Context = getObjFileLowering().getContext(); - MCSectionELF *ConfigSection = - Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); + if (!STM.isAmdHsaOS()) { + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + } - const AMDGPUSubtarget &STM = MF.getSubtarget(); SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { getSIProgramInfo(KernelInfo, MF); diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 85cbadf0a57..5f651d4da5d 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -269,7 +269,7 @@ unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) { unsigned encodeWaitcnt(IsaVersion Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) { - unsigned Waitcnt = getWaitcntBitMask(Version);; + unsigned Waitcnt = getWaitcntBitMask(Version); Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); Waitcnt = encodeExpcnt(Version, Waitcnt, Expcnt); Waitcnt = encodeLgkmcnt(Version, Waitcnt, Lgkmcnt); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 10e6297ef1e..cc001b59678 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -338,14 +338,17 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } 
-int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. unsigned NumVectorInstToHideOverhead = 10; + int MaxMergeDistance = 64; - if (Ty->isVectorTy() && IsComplex) + if (Ty->isVectorTy() && SE && + !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; // In many cases the address computation is not merged into the instruction diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index d83228afb0a..731a5adf3d7 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -104,7 +104,8 @@ class ARMTTIImpl : public BasicTTIImplBase { int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - int getAddressComputationCost(Type *Val, bool IsComplex); + int getAddressComputationCost(Type *Val, ScalarEvolution *SE, + const SCEV *Ptr); int getFPOpCost(Type *Ty); diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 903f92a0443..57ead973b56 100644 --- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -8,23 +8,41 @@ //===----------------------------------------------------------------------===// #include "Lanai.h" +#include "LanaiAluCode.h" +#include "LanaiCondCode.h" #include "MCTargetDesc/LanaiMCExpr.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" +#include +#include +#include +#include +#include namespace llvm { + +// Auto-generated by TableGen +static unsigned MatchRegisterName(StringRef Name); + namespace { + struct LanaiOperand; class LanaiAsmParser : public MCTargetAsmParser { @@ -80,9 +98,6 @@ class LanaiAsmParser : public MCTargetAsmParser { const MCSubtargetInfo &SubtargetInfo; }; -// Auto-generated by TableGen -static unsigned MatchRegisterName(llvm::StringRef Name); - // LanaiOperand - Instances of this class represented a parsed machine // instruction struct LanaiOperand : public MCParsedAsmOperand { @@ -627,6 +642,8 @@ struct LanaiOperand : public MCParsedAsmOperand { } }; +} // end anonymous namespace + bool LanaiAsmParser::ParseDirective(AsmToken /*DirectiveId*/) { return true; } bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode, @@ -680,11 +697,11 @@ std::unique_ptr LanaiAsmParser::parseRegister() { if (Lexer.getKind() == AsmToken::Identifier) { RegNum = MatchRegisterName(Lexer.getTok().getIdentifier()); if (RegNum == 0) - return 0; + 
return nullptr; Parser.Lex(); // Eat identifier token return LanaiOperand::createReg(RegNum, Start, End); } - return 0; + return nullptr; } bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc, @@ -701,15 +718,15 @@ bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc, std::unique_ptr LanaiAsmParser::parseIdentifier() { SMLoc Start = Parser.getTok().getLoc(); SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - const MCExpr *Res, *RHS = 0; + const MCExpr *Res, *RHS = nullptr; LanaiMCExpr::VariantKind Kind = LanaiMCExpr::VK_Lanai_None; if (Lexer.getKind() != AsmToken::Identifier) - return 0; + return nullptr; StringRef Identifier; if (Parser.parseIdentifier(Identifier)) - return 0; + return nullptr; // Check if identifier has a modifier if (Identifier.equals_lower("hi")) @@ -722,24 +739,24 @@ std::unique_ptr LanaiAsmParser::parseIdentifier() { if (Kind != LanaiMCExpr::VK_Lanai_None) { if (Lexer.getKind() != AsmToken::LParen) { Error(Lexer.getLoc(), "Expected '('"); - return 0; + return nullptr; } Lexer.Lex(); // lex '(' // Parse identifier if (Parser.parseIdentifier(Identifier)) - return 0; + return nullptr; } // If addition parse the RHS. if (Lexer.getKind() == AsmToken::Plus && Parser.parseExpression(RHS)) - return 0; + return nullptr; // For variants parse the final ')' if (Kind != LanaiMCExpr::VK_Lanai_None) { if (Lexer.getKind() != AsmToken::RParen) { Error(Lexer.getLoc(), "Expected ')'"); - return 0; + return nullptr; } Lexer.Lex(); // lex ')' } @@ -771,7 +788,7 @@ std::unique_ptr LanaiAsmParser::parseImmediate() { if (!Parser.parseExpression(ExprVal)) return LanaiOperand::createImm(ExprVal, Start, End); default: - return 0; + return nullptr; } } @@ -1204,10 +1221,9 @@ bool LanaiAsmParser::ParseInstruction(ParseInstructionInfo & /*Info*/, #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION #include "LanaiGenAsmMatcher.inc" -} // namespace extern "C" void LLVMInitializeLanaiAsmParser() { RegisterMCAsmParser x(getTheLanaiTarget()); } -} // namespace llvm +} // end namespace llvm diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h index a317cd88ad6..e0c19e8ea64 100644 --- a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h +++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h @@ -20,14 +20,11 @@ namespace llvm { -class MCInst; -class raw_ostream; - class LanaiDisassembler : public MCDisassembler { public: LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx); - ~LanaiDisassembler() override {} + ~LanaiDisassembler() override = default; // getInstruction - See MCDisassembler. 
MCDisassembler::DecodeStatus @@ -36,6 +33,6 @@ class LanaiDisassembler : public MCDisassembler { raw_ostream &CStream) const override; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h index 1c9d186ad81..59904fbaa31 100644 --- a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h +++ b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h @@ -14,10 +14,10 @@ #ifndef LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H #define LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { -class MCOperand; class LanaiInstPrinter : public MCInstPrinter { public: @@ -28,14 +28,14 @@ class LanaiInstPrinter : public MCInstPrinter { void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMemRiOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printMemRrOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O); void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O); void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -60,6 +60,7 @@ class LanaiInstPrinter : public MCInstPrinter { bool printMemoryStoreIncrement(const MCInst *MI, raw_ostream &Ostream, StringRef Opcode, int AddOffset); }; -} // namespace llvm + +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index ae7870e07d4..d156294a0b0 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -11,31 +11,46 @@ // //===----------------------------------------------------------------------===// -#include "LanaiISelLowering.h" - #include "Lanai.h" +#include "LanaiCondCode.h" +#include "LanaiISelLowering.h" #include "LanaiMachineFunctionInfo.h" #include "LanaiSubtarget.h" -#include "LanaiTargetMachine.h" #include "LanaiTargetObjectFile.h" +#include "MCTargetDesc/LanaiBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include 
"llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetMachine.h" +#include +#include +#include +#include +#include #define DEBUG_TYPE "lanai-lower" @@ -195,6 +210,7 @@ SDValue LanaiTargetLowering::LowerOperation(SDValue Op, llvm_unreachable("unimplemented operand"); } } + //===----------------------------------------------------------------------===// // Lanai Inline Assembly Support //===----------------------------------------------------------------------===// @@ -244,7 +260,7 @@ LanaiTargetLowering::getSingleConstraintMatchWeight( Value *CallOperandVal = Info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (CallOperandVal == nullptr) return CW_Default; // Look at the constraint type. switch (*Constraint) { @@ -270,7 +286,7 @@ LanaiTargetLowering::getSingleConstraintMatchWeight( void LanaiTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result(nullptr, 0); // Only support length 1 constraints for now. if (Constraint.length() > 1) @@ -676,7 +692,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo( } else { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) + if (StackPtr.getNode() == nullptr) StackPtr = DAG.getCopyFromReg(Chain, DL, Lanai::SP, getPointerTy(DAG.getDataLayout())); @@ -1120,7 +1136,7 @@ const char *LanaiTargetLowering::getTargetNodeName(unsigned Opcode) const { case LanaiISD::SMALL: return "LanaiISD::SMALL"; default: - return NULL; + return nullptr; } } diff --git a/lib/Target/Lanai/LanaiRegisterInfo.h b/lib/Target/Lanai/LanaiRegisterInfo.h index 8b84bbc460e..c6e459076eb 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.h +++ b/lib/Target/Lanai/LanaiRegisterInfo.h @@ -21,9 +21,6 @@ namespace llvm { -class TargetInstrInfo; -class Type; - struct LanaiRegisterInfo : public LanaiGenRegisterInfo { LanaiRegisterInfo(); @@ -32,7 +29,7 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { // Code Generation virtual methods. 
const uint16_t * - getCalleeSavedRegs(const MachineFunction *MF = 0) const override; + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; @@ -42,7 +39,7 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; bool canRealignStack(const MachineFunction &MF) const override; @@ -58,6 +55,6 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { int getDwarfRegNum(unsigned RegNum, bool IsEH) const; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp index e30d5e9a18e..e02bba529bd 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp @@ -9,20 +9,19 @@ #include "MCTargetDesc/LanaiBaseInfo.h" #include "MCTargetDesc/LanaiFixupKinds.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; namespace { + class LanaiELFObjectWriter : public MCELFObjectTargetWriter { public: explicit LanaiELFObjectWriter(uint8_t OSABI); - ~LanaiELFObjectWriter() override; + ~LanaiELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, @@ -30,14 +29,13 @@ class LanaiELFObjectWriter : public MCELFObjectTargetWriter { bool needsRelocateWithSymbol(const MCSymbol &SD, unsigned Type) const override; }; -} // namespace + +} // end anonymous namespace LanaiELFObjectWriter::LanaiELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit_=*/false, OSABI, ELF::EM_LANAI, /*HasRelocationAddend=*/true) {} -LanaiELFObjectWriter::~LanaiELFObjectWriter() {} - unsigned LanaiELFObjectWriter::getRelocType(MCContext & /*Ctx*/, const MCValue & /*Target*/, const MCFixup &Fixup, diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp index ce68b7e24db..f5b5335bb98 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp @@ -12,37 +12,38 @@ //===----------------------------------------------------------------------===// #include "Lanai.h" +#include "LanaiAluCode.h" #include "MCTargetDesc/LanaiBaseInfo.h" #include "MCTargetDesc/LanaiFixupKinds.h" #include "MCTargetDesc/LanaiMCExpr.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include +#include #define DEBUG_TYPE "mccodeemitter" STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace llvm { + namespace { -class LanaiMCCodeEmitter : public MCCodeEmitter { - LanaiMCCodeEmitter(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT 
- const MCInstrInfo &InstrInfo; - MCContext &Context; +class LanaiMCCodeEmitter : public MCCodeEmitter { public: - LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C) - : InstrInfo(MCII), Context(C) {} - - ~LanaiMCCodeEmitter() override {} + LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C) {} + LanaiMCCodeEmitter(const LanaiMCCodeEmitter &) = delete; + void operator=(const LanaiMCCodeEmitter &) = delete; + ~LanaiMCCodeEmitter() override = default; // The functions below are called by TableGen generated functions for getting // the binary encoding of instructions/opereands. @@ -86,6 +87,8 @@ class LanaiMCCodeEmitter : public MCCodeEmitter { const MCSubtargetInfo &STI) const; }; +} // end anonymous namespace + Lanai::Fixups FixupKind(const MCExpr *Expr) { if (isa(Expr)) return Lanai::FIXUP_LANAI_21; @@ -298,8 +301,8 @@ unsigned LanaiMCCodeEmitter::getBranchTargetOpValue( } #include "LanaiGenMCCodeEmitter.inc" -} // namespace -} // namespace llvm + +} // end namespace llvm llvm::MCCodeEmitter * llvm::createLanaiMCCodeEmitter(const MCInstrInfo &InstrInfo, diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp index c2f8c0f7ad5..a47ff9ff3d6 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp @@ -11,16 +11,21 @@ // //===----------------------------------------------------------------------===// +#include "LanaiMCAsmInfo.h" #include "LanaiMCTargetDesc.h" - #include "InstPrinter/LanaiInstPrinter.h" -#include "LanaiMCAsmInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include +#include #define GET_INSTRINFO_MC_DESC #include "LanaiGenInstrInfo.inc" @@ -70,7 +75,7 @@ static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/, const MCRegisterInfo &MRI) { if (SyntaxVariant == 0) return new LanaiInstPrinter(MAI, MII, MRI); - return 0; + return nullptr; } static MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple, @@ -79,6 +84,7 @@ static MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple, } namespace { + class LanaiMCInstrAnalysis : public MCInstrAnalysis { public: explicit LanaiMCInstrAnalysis(const MCInstrInfo *Info) @@ -107,6 +113,7 @@ class LanaiMCInstrAnalysis : public MCInstrAnalysis { } } }; + } // end anonymous namespace static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) { @@ -131,7 +138,7 @@ extern "C" void LLVMInitializeLanaiTargetMC() { // Register the MC code emitter TargetRegistry::RegisterMCCodeEmitter(getTheLanaiTarget(), - llvm::createLanaiMCCodeEmitter); + createLanaiMCCodeEmitter); // Register the ASM Backend TargetRegistry::RegisterMCAsmBackend(getTheLanaiTarget(), diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index d3c88482f09..05acd25ae5f 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -47,7 +47,7 @@ namespace llvm { FCTIDZ, FCTIWZ, /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for - /// unsigned integers. + /// unsigned integers with round toward zero. 
FCTIDUZ, FCTIWUZ, /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 03b2257a88a..fbec8787ef8 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1154,6 +1154,9 @@ defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB), defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB), "fctid", "$frD, $frB", IIC_FPGeneral, []>, isPPC64; +defm FCTIDU : XForm_26r<63, 942, (outs f8rc:$frD), (ins f8rc:$frB), + "fctidu", "$frD, $frB", IIC_FPGeneral, + []>, isPPC64; defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB), "fctidz", "$frD, $frB", IIC_FPGeneral, [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64; diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index 99689f656c2..ef7d2012a23 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -603,6 +603,12 @@ class XForm_17 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = 0; } +class XForm_17a opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : XForm_17 { + let FRA = 0; +} + // Used for QPX class XForm_18 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index a7231bd2e2c..90111bbea07 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -2172,11 +2172,19 @@ let isCompare = 1, hasSideEffects = 0 in { "fcmpu $crD, $fA, $fB", IIC_FPCompare>; } +def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), + "ftdiv $crD, $fA, $fB", IIC_FPCompare>; +def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB), + "ftsqrt $crD, $fB", IIC_FPCompare>; + let Uses = [RM] in { let hasSideEffects = 0 in { defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), "fctiw", "$frD, $frB", IIC_FPGeneral, []>; + defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwu", "$frD, $frB", IIC_FPGeneral, + []>; defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), "fctiwz", "$frD, $frB", IIC_FPGeneral, [(set f64:$frD, (PPCfctiwz f64:$frB))]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index fd218939727..7f72ab17f61 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16985,10 +16985,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } - if (Cond.getOpcode() == ISD::SETCC) { + if (Cond.getOpcode() == ISD::SETCC) if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; - } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y @@ -18289,6 +18288,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, /// constant. Takes immediate version of shift as input. 
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); @@ -18306,27 +18306,32 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } - const X86Subtarget &Subtarget = - static_cast(DAG.getSubtarget()); - if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && - ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { - // Let the shuffle legalizer expand this shift amount node. + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + // +=================+============+=======================================+ + // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | + // +=================+============+=======================================+ + // | i64 | Yes, No | Use ShAmt as lowest elt | + // | i32 | Yes | zero-extend in-reg | + // | (i32 zext(i16)) | Yes | zero-extend in-reg | + // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | + // +=================+============+=======================================+ + + if (SVT == MVT::i64) + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); + else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { SDValue Op0 = ShAmt.getOperand(0); Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG); + ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64); + } else if (Subtarget.hasSSE41() && + ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { - // Need to build a vector containing shift amount. - // SSE/AVX packed shifts only use the lower 64-bit of the shift count. - SmallVector ShOps; - ShOps.push_back(ShAmt); - if (SVT == MVT::i32) { - ShOps.push_back(DAG.getConstant(0, dl, SVT)); - ShOps.push_back(DAG.getUNDEF(SVT)); - } - ShOps.push_back(DAG.getUNDEF(SVT)); - - MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; - ShAmt = DAG.getBuildVector(BVT, dl, ShOps); + SmallVector ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), + DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; + ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } // The return type has to be a 128-bit type with the same element @@ -19014,7 +19019,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); + Op.getOperand(1), Op.getOperand(2), Subtarget, + DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); @@ -21276,7 +21282,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); - return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); } } @@ -25951,12 +25957,11 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, // instructions. 
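The getTargetVShiftNode rewrite above leans on the fact, stated in its new comment, that SSE/AVX packed shifts read only the low 64 bits of the count operand, which is why placing the scalar amount in the lowest element and zeroing element 1 is enough. A minimal plain-C++ model of that semantic for a logical 32-bit left shift (an illustration only, not LLVM code; the count-ge-32 behaviour follows the PSLLD definition):

    #include <array>
    #include <cstdint>

    // Model of a packed 32-bit logical left shift with a vector count
    // (PSLLD xmm, xmm): every lane is shifted by the same amount, taken
    // from the low 64 bits of the count register; counts >= 32 zero the lane.
    std::array<uint32_t, 4> packedShl32(std::array<uint32_t, 4> X,
                                        std::array<uint32_t, 4> CountVec) {
      uint64_t Count = (uint64_t)CountVec[0] | ((uint64_t)CountVec[1] << 32);
      std::array<uint32_t, 4> R{};
      for (int I = 0; I != 4; ++I)
        R[I] = Count >= 32 ? 0 : X[I] << Count;
      return R;
    }

With the build_vector form {ShAmt, 0, undef, undef} used in the patch, the low 64 bits are exactly the zero-extended scalar amount, so the undef upper elements never influence the result.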
// TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, + bool FloatDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - bool FloatDomain = MaskVT.isFloatingPoint() || - (!Subtarget.hasAVX2() && MaskVT.is256BitVector()); // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && @@ -26067,11 +26072,11 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, + bool FloatDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); - bool FloatDomain = MaskVT.isFloatingPoint(); bool ContainsZeros = false; SmallBitVector Zeroable(NumMaskElts, false); @@ -26211,11 +26216,10 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, - SDValue &V1, SDValue &V2, + bool FloatDomain, SDValue &V1, SDValue &V2, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, bool IsUnary) { - bool FloatDomain = MaskVT.isFloatingPoint(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { @@ -26310,13 +26314,13 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, + bool FloatDomain, SDValue &V1, SDValue &V2, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); - bool FloatDomain = MaskVT.isFloatingPoint(); // Attempt to match against PALIGNR byte rotate. if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || @@ -26594,8 +26598,8 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } } - if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT, - ShuffleVT)) { + if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle, + ShuffleSrcVT, ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26609,8 +26613,8 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return true; } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, - ShuffleVT, PermuteImm)) { + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, + Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26626,8 +26630,8 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } } - if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle, - ShuffleVT, UnaryShuffle)) { + if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget, + Shuffle, ShuffleVT, UnaryShuffle)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! 
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26643,8 +26647,9 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return true; } - if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget, - Shuffle, ShuffleVT, PermuteImm)) { + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL, + DAG, Subtarget, Shuffle, ShuffleVT, + PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -28742,6 +28747,27 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DAG.getConstant(Imm, DL, MVT::i8))); return true; } + case ISD::EXTRACT_SUBVECTOR: { + unsigned EltSize = EltVT.getSizeInBits(); + if (EltSize != 32 && EltSize != 64) + return false; + MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); + // Only change element size, not type. + if (VT.isInteger() != OpEltVT.isInteger()) + return false; + uint64_t Imm = cast(Op.getOperand(1))->getZExtValue(); + Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; + // Op0 needs to be bitcasted to a larger vector with the same element type. + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = MVT::getVectorVT(EltVT, + Op0.getSimpleValueType().getSizeInBits() / EltSize); + Op0 = DAG.getBitcast(Op0VT, Op0); + DCI.AddToWorklist(Op0.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0, + DAG.getConstant(Imm, DL, MVT::i8))); + return true; + } } return false; @@ -30921,6 +30947,59 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } +/// Check if truncation with saturation form type \p SrcVT to \p DstVT +/// is valid for the given \p Subtarget. +static bool +isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX512()) + return false; + EVT SrcElVT = SrcVT.getScalarType(); + EVT DstElVT = DstVT.getScalarType(); + if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) + return false; + if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32) + return false; + if (SrcVT.is512BitVector() || Subtarget.hasVLX()) + return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); + return false; +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched or the unsupported on the current target. +static SDValue +detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) { + if (In.getOpcode() != ISD::UMIN) + return SDValue(); + + EVT InVT = In.getValueType(); + // FIXME: Scalar type may be supported if we move it to vector register. + if (!InVT.isVector() || !InVT.isSimple()) + return SDValue(); + + if (!isSATValidOnSubtarget(InVT, VT, Subtarget)) + return SDValue(); + + //Saturation with truncation. We truncate from InVT to VT. + assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && + "Unexpected types for truncate operation"); + + SDValue SrcVal; + APInt C; + if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C)) + SrcVal = In.getOperand(1); + else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) + SrcVal = In.getOperand(0); + else + return SDValue(); + + // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according + // the element size of the destination type. 
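As a quick numeric check of the comment above (a standalone sketch, not part of the patch): the splat bound that makes the umin-then-truncate pattern a saturating truncate must be exactly 2^width - 1 for the destination element width.

    #include <cstdint>

    // Largest value representable in an unsigned destination element type;
    // this is the constant the detectUSatPattern check compares against.
    constexpr uint64_t usatBound(unsigned DstBits) {
      return (uint64_t(1) << DstBits) - 1;
    }

    static_assert(usatBound(8)  == 0xFF,        "UINT8_MAX");
    static_assert(usatBound(16) == 0xFFFF,      "UINT16_MAX");
    static_assert(usatBound(32) == 0xFFFFFFFF,  "UINT32_MAX");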
+ return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ? + SrcVal : SDValue(); +} + /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. @@ -31487,6 +31566,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); + if (SDValue Val = + detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); @@ -31967,7 +32052,8 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. static SDValue -combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, +combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget, + SelectionDAG &DAG, SmallVector &Regs) { assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); EVT OutVT = N->getValueType(0); @@ -31976,8 +32062,10 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, // Shift left by 16 bits, then arithmetic-shift right by 16 bits. SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); for (auto &Reg : Regs) { - Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG); - Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG); + Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, + Subtarget, DAG); + Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, + Subtarget, DAG); } for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) @@ -32046,7 +32134,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DAG, SubVec); else if (InSVT == MVT::i32) - return combineVectorTruncationWithPACKSS(N, DAG, SubVec); + return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec); else return SDValue(); } @@ -32104,6 +32192,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try the truncation with unsigned saturation. + if (SDValue Val = detectUSatPattern(Src, VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val); + // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index d7792e296a5..de4839432b9 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -80,9 +80,12 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (Vector) { - if (ST->hasAVX512()) return 512; - if (ST->hasAVX()) return 256; - if (ST->hasSSE1()) return 128; + if (ST->hasAVX512()) + return 512; + if (ST->hasAVX()) + return 256; + if (ST->hasSSE1()) + return 128; return 0; } @@ -211,11 +214,9 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX512DQ lowering tricks for custom cases. 
- if (ST->hasDQI()) { - if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, - LT.second)) + if (ST->hasDQI()) + if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX512BWCostTable[] = { { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. @@ -225,37 +226,38 @@ int X86TTIImpl::getArithmeticInstrCost( // Vectorizing division is a bad idea. See the SSE2 table for more comments. { ISD::SDIV, MVT::v64i8, 64*20 }, { ISD::SDIV, MVT::v32i16, 32*20 }, - { ISD::SDIV, MVT::v16i32, 16*20 }, - { ISD::SDIV, MVT::v8i64, 8*20 }, { ISD::UDIV, MVT::v64i8, 64*20 }, - { ISD::UDIV, MVT::v32i16, 32*20 }, - { ISD::UDIV, MVT::v16i32, 16*20 }, - { ISD::UDIV, MVT::v8i64, 8*20 }, + { ISD::UDIV, MVT::v32i16, 32*20 } }; // Look for AVX512BW lowering tricks for custom cases. - if (ST->hasBWI()) { - if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, - LT.second)) + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX512CostTable[] = { - { ISD::SHL, MVT::v16i32, 1 }, - { ISD::SRL, MVT::v16i32, 1 }, - { ISD::SRA, MVT::v16i32, 1 }, - { ISD::SHL, MVT::v8i64, 1 }, - { ISD::SRL, MVT::v8i64, 1 }, - { ISD::SRA, MVT::v8i64, 1 }, - - { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. + { ISD::SHL, MVT::v16i32, 1 }, + { ISD::SRL, MVT::v16i32, 1 }, + { ISD::SRA, MVT::v16i32, 1 }, + { ISD::SHL, MVT::v8i64, 1 }, + { ISD::SRL, MVT::v8i64, 1 }, + { ISD::SRA, MVT::v8i64, 1 }, + + { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i32, 1 }, // pmulld + { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v16i32, 16*20 }, + { ISD::SDIV, MVT::v8i64, 8*20 }, + { ISD::UDIV, MVT::v16i32, 16*20 }, + { ISD::UDIV, MVT::v8i64, 8*20 } }; - if (ST->hasAVX512()) { + if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to @@ -315,10 +317,9 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for XOP lowering tricks. - if (ST->hasXOP()) { + if (ST->hasXOP()) if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX2CustomCostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. @@ -334,6 +335,8 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i32, 1 }, // pmulld + { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ @@ -344,11 +347,10 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX2()) { + if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVXCustomCostTable[] = { { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. 
@@ -372,24 +374,10 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX()) { + if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - - static const CostTblEntry SSE42FloatCostTable[] = { - { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ - }; - - if (ST->hasSSE42()) { - if (const auto *Entry = CostTableLookup(SSE42FloatCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; - } static const CostTblEntry SSE2UniformCostTable[] = { @@ -452,6 +440,17 @@ int X86TTIImpl::getArithmeticInstrCost( ISD = ISD::MUL; } + static const CostTblEntry SSE42CostTable[] = { + { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ + }; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry SSE41CostTable[] = { { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. @@ -471,44 +470,39 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence. { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend. + + { ISD::MUL, MVT::v4i32, 1 } // pmulld }; - if (ST->hasSSE41()) { + if (ST->hasSSE41()) if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. - { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. 
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle + { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ @@ -531,10 +525,9 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::UDIV, MVT::v2i64, 2*20 }, }; - if (ST->hasSSE2()) { + if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized @@ -553,307 +546,278 @@ int X86TTIImpl::getArithmeticInstrCost( // A v4i64 multiply is custom lowered as two split v2i64 vectors that then // are lowered as a series of long multiplies(3), shifts(3) and adds(2) // Because we believe v4i64 to be a legal type, we must also include the - // split factor of two in the cost table. Therefore, the cost here is 16 + // extract+insert in the cost table. Therefore, the cost here is 18 // instead of 8. - { ISD::MUL, MVT::v4i64, 16 }, + { ISD::MUL, MVT::v4i64, 18 }, }; // Look for AVX1 lowering tricks. - if (ST->hasAVX() && !ST->hasAVX2()) { - MVT VT = LT.second; - - if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + if (ST->hasAVX() && !ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - // Custom lowering of vectors. - static const CostTblEntry CustomLowered[] = { - // A v2i64/v4i64 and multiply is custom lowered as a series of long - // multiplies(3), shifts(3) and adds(2). - { ISD::MUL, MVT::v2i64, 8 }, - { ISD::MUL, MVT::v4i64, 8 }, - { ISD::MUL, MVT::v8i64, 8 } - }; - if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) - return LT.first * Entry->Cost; - - // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, - // 2x pmuludq, 2x shuffle. - if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() && - !ST->hasSSE41()) - return LT.first * 6; - - static const CostTblEntry SSE1FloatCostTable[] = { + static const CostTblEntry SSE1CostTable[] = { { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ }; if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1FloatCostTable, ISD, - LT.second)) + if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) return LT.first * Entry->Cost; + // Fallback to the default implementation. return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) { - // 64-bit packed float vectors (v2f32) are widened to type v4f32. - // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb - { TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb - }; - - if (ST->hasVBMI()) - if (const auto *Entry = - CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + // 64-bit packed float vectors (v2f32) are widened to type v4f32. + // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. 
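To make the recurring `LT.first * Entry->Cost` pattern in the cost hunks above concrete, here is a worked instance using the v8i64 multiply entry added to AVX512CostTable; the assumption that an IR-level v16i64 multiply legalizes into two v8i64 operations is an illustrative choice, not something the patch states.

    // Per-legal-vector cost from the AVX512CostTable entry above:
    //   { ISD::MUL, MVT::v8i64, 8 }   // 3*pmuludq + 3*shift + 2*add = 8
    constexpr int PerVectorMulCost = 3 + 3 + 2;                        // = 8
    constexpr int NumLegalVectors  = 2;   // assumed: v16i64 -> 2 x v8i64
    constexpr int TotalCost = NumLegalVectors * PerVectorMulCost;      // = 16
    static_assert(TotalCost == 16, "LT.first * Entry->Cost");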
+ std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + + // For Broadcasts we are splatting the first element from the first input + // register, so only need to reference that input and all the output + // registers are the same. + if (Kind == TTI::SK_Broadcast) + LT.first = 1; + + // We are going to permute multiple sources and the result will be in multiple + // destinations. Providing an accurate cost only for splits where the element + // type remains the same. + if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { + MVT LegalVT = LT.second; + if (LegalVT.getVectorElementType().getSizeInBits() == + Tp->getVectorElementType()->getPrimitiveSizeInBits() && + LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { + + unsigned VecTySize = DL.getTypeStoreSize(Tp); + unsigned LegalVTSize = LegalVT.getStoreSize(); + // Number of source vectors after legalization: + unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; + // Number of destination vectors after legalization: + unsigned NumOfDests = LT.first; + + Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), + LegalVT.getVectorNumElements()); + + unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; + return NumOfShuffles * + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); + } - static const CostTblEntry AVX512BWShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128 - // + 2*pshufb + vinserti64x4 - }; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } - if (ST->hasBWI()) - if (const auto *Entry = - CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + // For 2-input shuffles, we must account for splitting the 2 inputs into many. + if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { + // We assume that source and destination have the same vector type. 
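Plugging numbers into the single-source split formula above makes the accounting easier to follow; the concrete v32i16-to-v16i16 legalization is an assumed example (an AVX2-like target), not something the patch specifies.

    // Worked instance of the SK_PermuteSingleSrc split cost above.
    constexpr unsigned VecTySize   = 64;   // store size of v32i16, in bytes
    constexpr unsigned LegalVTSize = 32;   // store size of v16i16, in bytes
    constexpr unsigned NumOfSrcs   =
        (VecTySize + LegalVTSize - 1) / LegalVTSize;                   // = 2
    constexpr unsigned NumOfDests  = 2;    // LT.first after legalization
    constexpr unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;   // = 2
    static_assert(NumOfShuffles == 2,
                  "each destination needs one two-source shuffle of the legal type");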
+ int NumOfDests = LT.first; + int NumOfShufflesPerDest = LT.first * 2 - 1; + LT.first = NumOfDests * NumOfShufflesPerDest; + } - static const CostTblEntry AVX512ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd - }; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb + { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb - static const CostTblEntry AVX2ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd - { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb - { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb + { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b + }; - { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw - { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb - }; + if (ST->hasVBMI()) + if (const auto *Entry = + CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX512BWShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v64i8, 6 }, // vextracti64x4 + 2*vperm2i128 + // + 2*pshufb + vinserti64x4 + + { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16 + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc + + { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc + { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1 + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc + }; - static const CostTblEntry AVX1ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb - // + vinsertf128 - { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb - // + vinsertf128 - - { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd - { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd - { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps - { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps - { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor - { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand 
+ vpandn + vpor - }; + if (ST->hasBWI()) + if (const auto *Entry = + CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX512ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd + + { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd + + { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d + { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d + { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d + }; - static const CostTblEntry SSE41ShuffleTbl[] = { - { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps - { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb - }; - - if (ST->hasSSE41()) - if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry SSSE3ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb - { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb + static const CostTblEntry AVX2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd + { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd + 
{ TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb + { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb + + { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw + { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb + }; - { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por - }; + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasSSSE3()) - if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 + { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 + + { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + + { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd + { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd + { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps + { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps + { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor + { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor + }; - static const CostTblEntry SSE2ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd - { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd - { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw - // + 2*pshufd + 2*unpck + packus - - { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps - { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por - }; - - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry SSE1ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps - { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps - }; + static const CostTblEntry SSE41ShuffleTbl[] = { + { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps + { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb + }; - if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - } else if (Kind == 
TTI::SK_PermuteTwoSrc) { - // We assume that source and destination have the same vector type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - int NumOfDests = LT.first; - int NumOfShufflesPerDest = LT.first * 2 - 1; - int NumOfShuffles = NumOfDests * NumOfShufflesPerDest; - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // vpermt2b - }; - - if (ST->hasVBMI()) - if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - static const CostTblEntry AVX512BWShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}, // zext + vpermt2w + trunc - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1 - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // zext + vpermt2w + trunc - }; - - if (ST->hasBWI()) - if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - static const CostTblEntry AVX512ShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d - {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermt2d - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1} // vpermt2d - }; + static const CostTblEntry SSSE3ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - } else if (Kind == TTI::SK_PermuteSingleSrc) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - if (LT.first == 1) { - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1} // vpermb - }; - - if (ST->hasVBMI()) - if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - static const CostTblEntry AVX512BWShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8}, // extend to v32i16 - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3} // vpermw + zext/trunc - }; - - if (ST->hasBWI()) - if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - static const CostTblEntry AVX512ShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermq - 
{ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermq - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermq - {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // pshufb - }; - - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - } else { - // We are going to permute multiple sources and the result will be in - // multiple destinations. Providing an accurate cost only for splits where - // the element type remains the same. - - MVT LegalVT = LT.second; - if (LegalVT.getVectorElementType().getSizeInBits() == - Tp->getVectorElementType()->getPrimitiveSizeInBits() && - LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { - - unsigned VecTySize = DL.getTypeStoreSize(Tp); - unsigned LegalVTSize = LegalVT.getStoreSize(); - // Number of source vectors after legalization: - unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; - // Number of destination vectors after legalization: - unsigned NumOfDests = LT.first; - - Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), - LegalVT.getVectorNumElements()); - - unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; - return NumOfShuffles * - getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); - } - } - } + { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por + { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por + }; + + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd + { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd + + { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd + { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw + // + 2*pshufd + 2*unpck + packus + + { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps + { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por + { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por + }; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps + { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps + { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps + }; + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } @@ -1623,17 +1587,29 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost+LT.first; } -int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in 
vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. unsigned NumVectorInstToHideOverhead = 10; - if (Ty->isVectorTy() && IsComplex) - return NumVectorInstToHideOverhead; + // Cost modeling of Strided Access Computation is hidden by the indexing + // modes of X86 regardless of the stride value. We dont believe that there + // is a difference between constant strided access in gerenal and constant + // strided value which is less than or equal to 64. + // Even in the case of (loop invariant) stride whose value is not known at + // compile time, the address computation will not incur more than one extra + // ADD instruction. + if (Ty->isVectorTy() && SE) { + if (!BaseT::isStridedAccess(Ptr)) + return NumVectorInstToHideOverhead; + if (!BaseT::getConstantStrideStep(SE, Ptr)) + return 1; + } - return BaseT::getAddressComputationCost(Ty, IsComplex); + return BaseT::getAddressComputationCost(Ty, SE, Ptr); } int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index f6bcb9f569e..c013805f432 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -71,7 +71,8 @@ class X86TTIImpl : public BasicTTIImplBase { unsigned AddressSpace); int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment); - int getAddressComputationCost(Type *PtrTy, bool IsComplex); + int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys, FastMathFlags FMF); diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 6dd95f8dcd5..6b32f6c31f7 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -36,7 +36,10 @@ using namespace llvm; -STATISTIC(NumImported, "Number of functions imported"); +STATISTIC(NumImportedFunctions, "Number of functions imported"); +STATISTIC(NumImportedModules, "Number of modules imported from"); +STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index"); +STATISTIC(NumLiveSymbols, "Number of live symbols in index"); /// Limit on instruction count of imported functions. static cl::opt ImportInstrLimit( @@ -69,6 +72,9 @@ static cl::opt ImportColdMultiplier( static cl::opt PrintImports("print-imports", cl::init(false), cl::Hidden, cl::desc("Print imported functions")); +static cl::opt ComputeDead("compute-dead", cl::init(true), cl::Hidden, + cl::desc("Compute dead symbols")); + // Temporary allows the function import pass to disable always linking // referenced discardable symbols. static cl::opt @@ -105,78 +111,6 @@ static std::unique_ptr loadFile(const std::string &FileName, namespace { -// Return true if the Summary describes a GlobalValue that can be externally -// referenced, i.e. it does not need renaming (linkage is not local) or renaming -// is possible (does not have a section for instance). -static bool canBeExternallyReferenced(const GlobalValueSummary &Summary) { - if (!Summary.needsRenaming()) - return true; - - if (Summary.noRename()) - // Can't externally reference a global that needs renaming if has a section - // or is referenced from inline assembly, for example. 
- return false; - - return true; -} - -// Return true if \p GUID describes a GlobalValue that can be externally -// referenced, i.e. it does not need renaming (linkage is not local) or -// renaming is possible (does not have a section for instance). -static bool canBeExternallyReferenced(const ModuleSummaryIndex &Index, - GlobalValue::GUID GUID) { - auto Summaries = Index.findGlobalValueSummaryList(GUID); - if (Summaries == Index.end()) - return true; - if (Summaries->second.size() != 1) - // If there are multiple globals with this GUID, then we know it is - // not a local symbol, and it is necessarily externally referenced. - return true; - - // We don't need to check for the module path, because if it can't be - // externally referenced and we call it, it is necessarilly in the same - // module - return canBeExternallyReferenced(**Summaries->second.begin()); -} - -// Return true if the global described by \p Summary can be imported in another -// module. -static bool eligibleForImport(const ModuleSummaryIndex &Index, - const GlobalValueSummary &Summary) { - if (!canBeExternallyReferenced(Summary)) - // Can't import a global that needs renaming if has a section for instance. - // FIXME: we may be able to import it by copying it without promotion. - return false; - - // Don't import functions that are not viable to inline. - if (Summary.isNotViableToInline()) - return false; - - // Check references (and potential calls) in the same module. If the current - // value references a global that can't be externally referenced it is not - // eligible for import. First check the flag set when we have possible - // opaque references (e.g. inline asm calls), then check the call and - // reference sets. - if (Summary.hasInlineAsmMaybeReferencingInternal()) - return false; - bool AllRefsCanBeExternallyReferenced = - llvm::all_of(Summary.refs(), [&](const ValueInfo &VI) { - return canBeExternallyReferenced(Index, VI.getGUID()); - }); - if (!AllRefsCanBeExternallyReferenced) - return false; - - if (auto *FuncSummary = dyn_cast(&Summary)) { - bool AllCallsCanBeExternallyReferenced = llvm::all_of( - FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) { - return canBeExternallyReferenced(Index, Edge.first.getGUID()); - }); - if (!AllCallsCanBeExternallyReferenced) - return false; - } - return true; -} - /// Given a list of possible callee implementation for a call site, select one /// that fits the \p Threshold. /// @@ -214,7 +148,7 @@ selectCallee(const ModuleSummaryIndex &Index, if (Summary->instCount() > Threshold) return false; - if (!eligibleForImport(Index, *Summary)) + if (Summary->notEligibleToImport()) return false; return true; @@ -346,7 +280,8 @@ static void computeImportForFunction( static void ComputeImportForModule( const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index, FunctionImporter::ImportMapTy &ImportList, - StringMap *ExportLists = nullptr) { + StringMap *ExportLists = nullptr, + const DenseSet *DeadSymbols = nullptr) { // Worklist contains the list of function imported in this module, for which // we will analyse the callees and may import further down the callgraph. 
SmallVector Worklist; @@ -354,6 +289,10 @@ static void ComputeImportForModule( // Populate the worklist with the import for the functions in the current // module for (auto &GVSummary : DefinedGVSummaries) { + if (DeadSymbols && DeadSymbols->count(GVSummary.first)) { + DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n"); + continue; + } auto *Summary = GVSummary.second; if (auto *AS = dyn_cast(Summary)) Summary = &AS->getAliasee(); @@ -393,14 +332,15 @@ void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, const StringMap &ModuleToDefinedGVSummaries, StringMap &ImportLists, - StringMap &ExportLists) { + StringMap &ExportLists, + const DenseSet *DeadSymbols) { // For each module that has function defined, compute the import/export lists. for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) { auto &ImportList = ImportLists[DefinedGVSummaries.first()]; DEBUG(dbgs() << "Computing import for Module '" << DefinedGVSummaries.first() << "'\n"); ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList, - &ExportLists); + &ExportLists, DeadSymbols); } // When computing imports we added all GUIDs referenced by anything @@ -462,6 +402,86 @@ void llvm::ComputeCrossModuleImportForModule( #endif } +DenseSet llvm::computeDeadSymbols( + const ModuleSummaryIndex &Index, + const DenseSet &GUIDPreservedSymbols) { + if (!ComputeDead) + return DenseSet(); + if (GUIDPreservedSymbols.empty()) + // Don't do anything when nothing is live, this is friendly with tests. + return DenseSet(); + DenseSet LiveSymbols = GUIDPreservedSymbols; + SmallVector Worklist; + Worklist.reserve(LiveSymbols.size() * 2); + for (auto GUID : LiveSymbols) { + DEBUG(dbgs() << "Live root: " << GUID << "\n"); + Worklist.push_back(GUID); + } + // Add values flagged in the index as live roots to the worklist. 
+ for (const auto &Entry : Index) { + bool IsLiveRoot = llvm::any_of( + Entry.second, + [&](const std::unique_ptr &Summary) { + return Summary->liveRoot(); + }); + if (!IsLiveRoot) + continue; + DEBUG(dbgs() << "Live root (summary): " << Entry.first << "\n"); + Worklist.push_back(Entry.first); + } + + while (!Worklist.empty()) { + auto GUID = Worklist.pop_back_val(); + auto It = Index.findGlobalValueSummaryList(GUID); + if (It == Index.end()) { + DEBUG(dbgs() << "Not in index: " << GUID << "\n"); + continue; + } + + // FIXME: we should only make the prevailing copy live here + for (auto &Summary : It->second) { + for (auto Ref : Summary->refs()) { + auto RefGUID = Ref.getGUID(); + if (LiveSymbols.insert(RefGUID).second) { + DEBUG(dbgs() << "Marking live (ref): " << RefGUID << "\n"); + Worklist.push_back(RefGUID); + } + } + if (auto *FS = dyn_cast(Summary.get())) { + for (auto Call : FS->calls()) { + auto CallGUID = Call.first.getGUID(); + if (LiveSymbols.insert(CallGUID).second) { + DEBUG(dbgs() << "Marking live (call): " << CallGUID << "\n"); + Worklist.push_back(CallGUID); + } + } + } + if (auto *AS = dyn_cast(Summary.get())) { + auto AliaseeGUID = AS->getAliasee().getOriginalName(); + if (LiveSymbols.insert(AliaseeGUID).second) { + DEBUG(dbgs() << "Marking live (alias): " << AliaseeGUID << "\n"); + Worklist.push_back(AliaseeGUID); + } + } + } + } + DenseSet DeadSymbols; + DeadSymbols.reserve( + std::min(Index.size(), Index.size() - LiveSymbols.size())); + for (auto &Entry : Index) { + auto GUID = Entry.first; + if (!LiveSymbols.count(GUID)) { + DEBUG(dbgs() << "Marking dead: " << GUID << "\n"); + DeadSymbols.insert(GUID); + } + } + DEBUG(dbgs() << LiveSymbols.size() << " symbols Live, and " + << DeadSymbols.size() << " symbols Dead \n"); + NumDeadSymbols += DeadSymbols.size(); + NumLiveSymbols += LiveSymbols.size(); + return DeadSymbols; +} + /// Compute the set of summaries needed for a ThinLTO backend compilation of /// \p ModulePath. void llvm::gatherImportedSummariesForModule( @@ -625,7 +645,6 @@ Expected FunctionImporter::importFunctions( // now, before linking it (otherwise this will be a noop). if (Error Err = SrcModule->materializeMetadata()) return std::move(Err); - UpgradeDebugInfo(*SrcModule); auto &ImportGUIDs = FunctionsToImportPerModule->second; // Find the globals to import @@ -698,6 +717,10 @@ Expected FunctionImporter::importFunctions( } } + // Upgrade debug info after we're done materializing all the globals and we + // have loaded all the required metadata! + UpgradeDebugInfo(*SrcModule); + // Link in the specified functions. 
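[Note on computeDeadSymbols above: it is a mark-and-sweep over the summary index. Preserved GUIDs and summaries flagged as live roots seed a worklist, references, calls and aliasees are marked transitively, and whatever remains is reported dead. A hedged sketch of how a ThinLTO driver might wire it into the import computation; the template arguments (DenseSet<GlobalValue::GUID>, StringMap<GVSummaryMapTy>, etc.) are restored from context since the generics are elided in the hunks above, and runImportPhase itself is an illustrative wrapper, not code from this patch:

    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Transforms/IPO/FunctionImport.h"
    using namespace llvm;

    void runImportPhase(
        const ModuleSummaryIndex &CombinedIndex,
        const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
        const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
      // Dead-strip the combined index first: preserved GUIDs and liveRoot()
      // summaries seed the worklist, everything unreachable comes back dead.
      DenseSet<GlobalValue::GUID> DeadSymbols =
          computeDeadSymbols(CombinedIndex, GUIDPreservedSymbols);

      // Then build the per-module import/export lists; ComputeImportForModule
      // skips any definition whose GUID is in DeadSymbols.
      StringMap<FunctionImporter::ImportMapTy> ImportLists(
          ModuleToDefinedGVSummaries.size());
      StringMap<FunctionImporter::ExportSetTy> ExportLists(
          ModuleToDefinedGVSummaries.size());
      ComputeCrossModuleImport(CombinedIndex, ModuleToDefinedGVSummaries,
                               ImportLists, ExportLists, &DeadSymbols);
    }
]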
if (renameModuleForThinLTO(*SrcModule, Index, &GlobalsToImport)) return true; @@ -717,9 +740,10 @@ Expected FunctionImporter::importFunctions( report_fatal_error("Function Import: link error"); ImportedCount += GlobalsToImport.size(); + NumImportedModules++; } - NumImported += ImportedCount; + NumImportedFunctions += ImportedCount; DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module " << DestModule.getModuleIdentifier() << "\n"); diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index 2948878cffc..f4742aaf748 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -27,9 +27,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndexYAML.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/TrailingObjects.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" @@ -52,6 +55,20 @@ static cl::opt AvoidReuse( cl::desc("Try to avoid reuse of byte array addresses using aliases"), cl::Hidden, cl::init(true)); +static cl::opt ClSummaryAction( + "lowertypetests-summary-action", + cl::desc("What to do with the summary when running this pass"), cl::Hidden); + +static cl::opt ClReadSummary( + "lowertypetests-read-summary", + cl::desc("Read summary from given YAML file before running pass"), + cl::Hidden); + +static cl::opt ClWriteSummary( + "lowertypetests-write-summary", + cl::desc("Write summary to given YAML file after running pass"), + cl::Hidden); + bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { if (Offset < ByteOffset) return false; @@ -66,38 +83,6 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { return Bits.count(BitOffset); } -bool BitSetInfo::containsValue( - const DataLayout &DL, - const DenseMap &GlobalLayout, Value *V, - uint64_t COffset) const { - if (auto GV = dyn_cast(V)) { - auto I = GlobalLayout.find(GV); - if (I == GlobalLayout.end()) - return false; - return containsGlobalOffset(I->second + COffset); - } - - if (auto GEP = dyn_cast(V)) { - APInt APOffset(DL.getPointerSizeInBits(0), 0); - bool Result = GEP->accumulateConstantOffset(DL, APOffset); - if (!Result) - return false; - COffset += APOffset.getZExtValue(); - return containsValue(DL, GlobalLayout, GEP->getPointerOperand(), COffset); - } - - if (auto Op = dyn_cast(V)) { - if (Op->getOpcode() == Instruction::BitCast) - return containsValue(DL, GlobalLayout, Op->getOperand(0), COffset); - - if (Op->getOpcode() == Instruction::Select) - return containsValue(DL, GlobalLayout, Op->getOperand(1), COffset) && - containsValue(DL, GlobalLayout, Op->getOperand(2), COffset); - } - - return false; -} - void BitSetInfo::print(raw_ostream &OS) const { OS << "offset " << ByteOffset << " size " << BitSize << " align " << (1 << AlignLog2); @@ -204,7 +189,7 @@ struct ByteArrayInfo { std::set Bits; uint64_t BitSize; GlobalVariable *ByteArray; - Constant *Mask; + GlobalVariable *MaskGlobal; }; /// A POD-like structure that we use to store a global reference together with @@ -241,6 +226,9 @@ class GlobalTypeMember final : TrailingObjects { class LowerTypeTestsModule { Module &M; + // This is for testing purposes only. 
+ std::unique_ptr OwnedSummary; + bool LinkerSubsectionsViaSymbols; Triple::ArchType Arch; Triple::OSType OS; @@ -248,6 +236,7 @@ class LowerTypeTestsModule { IntegerType *Int1Ty = Type::getInt1Ty(M.getContext()); IntegerType *Int8Ty = Type::getInt8Ty(M.getContext()); + PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); IntegerType *Int32Ty = Type::getInt32Ty(M.getContext()); PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty); IntegerType *Int64Ty = Type::getInt64Ty(M.getContext()); @@ -259,6 +248,37 @@ class LowerTypeTestsModule { // Mapping from type identifiers to the call sites that test them. DenseMap> TypeTestCallSites; + /// This structure describes how to lower type tests for a particular type + /// identifier. It is either built directly from the global analysis (during + /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type + /// identifier summaries and external symbol references (in ThinLTO backends). + struct TypeIdLowering { + TypeTestResolution::Kind TheKind; + + /// All except Unsat: the start address within the combined global. + Constant *OffsetedGlobal; + + /// ByteArray, Inline, AllOnes: log2 of the required global alignment + /// relative to the start address. + Constant *AlignLog2; + + /// ByteArray, Inline, AllOnes: size of the memory region covering members + /// of this type identifier as a multiple of 2^AlignLog2. + Constant *Size; + + /// ByteArray, Inline, AllOnes: range of the size expressed as a bit width. + unsigned SizeBitWidth; + + /// ByteArray: the byte array to test the address against. + Constant *TheByteArray; + + /// ByteArray: the bit mask to apply to bytes loaded from the byte array. + Constant *BitMask; + + /// Inline: the bit mask to test the address against. + Constant *InlineBits; + }; + std::vector ByteArrayInfos; Function *WeakInitializerFn = nullptr; @@ -268,15 +288,13 @@ class LowerTypeTestsModule { const DenseMap &GlobalLayout); ByteArrayInfo *createByteArray(BitSetInfo &BSI); void allocateByteArrays(); - Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI, + Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL, Value *BitOffset); void lowerTypeTestCalls( ArrayRef TypeIds, Constant *CombinedGlobalAddr, const DenseMap &GlobalLayout); - Value * - lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - Constant *CombinedGlobal, - const DenseMap &GlobalLayout); + Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI, + const TypeIdLowering &TIL); void buildBitSetsFromGlobalVariables(ArrayRef TypeIds, ArrayRef Globals); unsigned getJumpTableEntrySize(); @@ -302,6 +320,7 @@ class LowerTypeTestsModule { public: LowerTypeTestsModule(Module &M); + ~LowerTypeTestsModule(); bool lower(); }; @@ -380,7 +399,7 @@ ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) { BAI->Bits = BSI.Bits; BAI->BitSize = BSI.BitSize; BAI->ByteArray = ByteArrayGlobal; - BAI->Mask = ConstantExpr::getPtrToInt(MaskGlobal, Int8Ty); + BAI->MaskGlobal = MaskGlobal; return BAI; } @@ -399,8 +418,9 @@ void LowerTypeTestsModule::allocateByteArrays() { uint8_t Mask; BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask); - BAI->Mask->replaceAllUsesWith(ConstantInt::get(Int8Ty, Mask)); - cast(BAI->Mask->getOperand(0))->eraseFromParent(); + BAI->MaskGlobal->replaceAllUsesWith( + ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy)); + BAI->MaskGlobal->eraseFromParent(); } Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes); 
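[Worked example for the TypeIdLowering struct introduced above (my own illustration; the field values follow the selection logic in lowerTypeTestCalls further down, and Int8Ty / IntPtrTy / CombinedGlobalAddr are assumed to be in scope as in the pass): a type identifier whose members sit at byte offsets 0, 8, 16 and 24 of the combined global has ByteOffset 0, AlignLog2 3 and BitSize 4 with every bit set, so it resolves to:

    TypeIdLowering TIL;
    TIL.TheKind = TypeTestResolution::AllOnes;     // 4 bits, all set, > 1 member
    TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
        Int8Ty, CombinedGlobalAddr,
        ConstantInt::get(IntPtrTy, 0));            // ByteOffset 0
    TIL.AlignLog2 = ConstantInt::get(Int8Ty, 3);   // members are 8 bytes apart
    TIL.SizeBitWidth = 8;                          // BitSize 4 <= 256
    TIL.Size = ConstantInt::get(Int8Ty, 4);        // 4 members
    // TheByteArray, BitMask and InlineBits stay unset: AllOnes only needs the
    // range-and-alignment check.
]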
@@ -435,101 +455,121 @@ void LowerTypeTestsModule::allocateByteArrays() { ByteArraySizeBytes = BAB.Bytes.size(); } -/// Build a test that bit BitOffset is set in BSI, where -/// BitSetGlobal is a global containing the bits in BSI. -Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, - ByteArrayInfo *&BAI, +/// Build a test that bit BitOffset is set in the type identifier that was +/// lowered to TIL, which must be either an Inline or a ByteArray. +Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B, + const TypeIdLowering &TIL, Value *BitOffset) { - if (BSI.BitSize <= 64) { + if (TIL.TheKind == TypeTestResolution::Inline) { // If the bit set is sufficiently small, we can avoid a load by bit testing // a constant. - IntegerType *BitsTy; - if (BSI.BitSize <= 32) - BitsTy = Int32Ty; - else - BitsTy = Int64Ty; - - uint64_t Bits = 0; - for (auto Bit : BSI.Bits) - Bits |= uint64_t(1) << Bit; - Constant *BitsConst = ConstantInt::get(BitsTy, Bits); - return createMaskedBitTest(B, BitsConst, BitOffset); + return createMaskedBitTest(B, TIL.InlineBits, BitOffset); } else { - if (!BAI) { - ++NumByteArraysCreated; - BAI = createByteArray(BSI); - } - - Constant *ByteArray = BAI->ByteArray; - Type *Ty = BAI->ByteArray->getValueType(); + Constant *ByteArray = TIL.TheByteArray; if (!LinkerSubsectionsViaSymbols && AvoidReuse) { // Each use of the byte array uses a different alias. This makes the // backend less likely to reuse previously computed byte array addresses, // improving the security of the CFI mechanism based on this pass. - ByteArray = GlobalAlias::create(BAI->ByteArray->getValueType(), 0, - GlobalValue::PrivateLinkage, "bits_use", - ByteArray, &M); + ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage, + "bits_use", ByteArray, &M); } - Value *ByteAddr = B.CreateGEP(Ty, ByteArray, BitOffset); + Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset); Value *Byte = B.CreateLoad(ByteAddr); - Value *ByteAndMask = B.CreateAnd(Byte, BAI->Mask); + Value *ByteAndMask = + B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty)); return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0)); } } +static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL, + Value *V, uint64_t COffset) { + if (auto GV = dyn_cast(V)) { + SmallVector Types; + GV->getMetadata(LLVMContext::MD_type, Types); + for (MDNode *Type : Types) { + if (Type->getOperand(1) != TypeId) + continue; + uint64_t Offset = + cast( + cast(Type->getOperand(0))->getValue()) + ->getZExtValue(); + if (COffset == Offset) + return true; + } + return false; + } + + if (auto GEP = dyn_cast(V)) { + APInt APOffset(DL.getPointerSizeInBits(0), 0); + bool Result = GEP->accumulateConstantOffset(DL, APOffset); + if (!Result) + return false; + COffset += APOffset.getZExtValue(); + return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset); + } + + if (auto Op = dyn_cast(V)) { + if (Op->getOpcode() == Instruction::BitCast) + return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset); + + if (Op->getOpcode() == Instruction::Select) + return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) && + isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset); + } + + return false; +} + /// Lower a llvm.type.test call to its implementation. Returns the value to /// replace the call with. 
-Value *LowerTypeTestsModule::lowerBitSetCall( - CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - Constant *CombinedGlobalIntAddr, - const DenseMap &GlobalLayout) { +Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, + const TypeIdLowering &TIL) { + if (TIL.TheKind == TypeTestResolution::Unsat) + return ConstantInt::getFalse(M.getContext()); + Value *Ptr = CI->getArgOperand(0); const DataLayout &DL = M.getDataLayout(); - - if (BSI.containsValue(DL, GlobalLayout, Ptr)) + if (isKnownTypeIdMember(TypeId, DL, Ptr, 0)) return ConstantInt::getTrue(M.getContext()); - Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( - CombinedGlobalIntAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); - BasicBlock *InitialBB = CI->getParent(); IRBuilder<> B(CI); Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy); - if (BSI.isSingleOffset()) + Constant *OffsetedGlobalAsInt = + ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy); + if (TIL.TheKind == TypeTestResolution::Single) return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt); Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt); - Value *BitOffset; - if (BSI.AlignLog2 == 0) { - BitOffset = PtrOffset; - } else { - // We need to check that the offset both falls within our range and is - // suitably aligned. We can check both properties at the same time by - // performing a right rotate by log2(alignment) followed by an integer - // comparison against the bitset size. The rotate will move the lower - // order bits that need to be zero into the higher order bits of the - // result, causing the comparison to fail if they are nonzero. The rotate - // also conveniently gives us a bit offset to use during the load from - // the bitset. - Value *OffsetSHR = - B.CreateLShr(PtrOffset, ConstantInt::get(IntPtrTy, BSI.AlignLog2)); - Value *OffsetSHL = B.CreateShl( - PtrOffset, - ConstantInt::get(IntPtrTy, DL.getPointerSizeInBits(0) - BSI.AlignLog2)); - BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); - } - - Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize); + // We need to check that the offset both falls within our range and is + // suitably aligned. We can check both properties at the same time by + // performing a right rotate by log2(alignment) followed by an integer + // comparison against the bitset size. The rotate will move the lower + // order bits that need to be zero into the higher order bits of the + // result, causing the comparison to fail if they are nonzero. The rotate + // also conveniently gives us a bit offset to use during the load from + // the bitset. + Value *OffsetSHR = + B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy)); + Value *OffsetSHL = B.CreateShl( + PtrOffset, ConstantExpr::getZExt( + ConstantExpr::getSub( + ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)), + TIL.AlignLog2), + IntPtrTy)); + Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); + + Constant *BitSizeConst = ConstantExpr::getZExt(TIL.Size, IntPtrTy); Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst); // If the bit set is all ones, testing against it is unnecessary. - if (BSI.isAllOnes()) + if (TIL.TheKind == TypeTestResolution::AllOnes) return OffsetInRange; TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false); @@ -537,7 +577,7 @@ Value *LowerTypeTestsModule::lowerBitSetCall( // Now that we know that the offset is in range and aligned, load the // appropriate bit from the bitset. 
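[The rotate trick used by lowerTypeTestCall above deserves a worked example. A scalar model of the combined range-and-alignment check, written under the assumption of a 64-bit pointer width (illustration only, not the pass's actual IR-building code):

    #include <cstdint>

    bool offsetInRangeAndAligned(uint64_t PtrOffset, unsigned AlignLog2,
                                 uint64_t BitSetSize) {
      if (AlignLog2 == 0)
        return PtrOffset < BitSetSize;
      // Rotate right by log2(alignment): a misaligned offset gets its nonzero
      // low bits moved into the high bits, making the value huge and failing
      // the bound check; an aligned offset becomes its bit index into the
      // bitset.
      uint64_t BitOffset =
          (PtrOffset >> AlignLog2) | (PtrOffset << (64 - AlignLog2));
      return BitOffset < BitSetSize;
    }

    // With AlignLog2 = 3 and BitSetSize = 4 (members at +0, +8, +16, +24):
    //   offset 16 -> ror(16, 3) = 2  -> accepted (in range, aligned)
    //   offset 32 -> ror(32, 3) = 4  -> rejected (out of range)
    //   offset 12 -> huge value      -> rejected (misaligned)

    // If that check passes, the final bit test is either done on an inline
    // constant (TypeTestResolution::Inline, BitSize <= 64) ...
    bool inlineBitTest(uint64_t InlineBits, uint64_t BitOffset) {
      return (InlineBits >> BitOffset) & 1;
    }
    // ... or by loading one byte of the byte array and masking it
    // (TypeTestResolution::ByteArray), as createBitSetTest does above.
    bool byteArrayBitTest(const uint8_t *ByteArray, uint8_t BitMask,
                          uint64_t BitOffset) {
      return (ByteArray[BitOffset] & BitMask) != 0;
    }
]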
- Value *Bit = createBitSetTest(ThenB, BSI, BAI, BitOffset); + Value *Bit = createBitSetTest(ThenB, TIL, BitOffset); // The value we want is 0 if we came directly from the initial block // (having failed the range or alignment checks), or the loaded bit if @@ -622,11 +662,7 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( void LowerTypeTestsModule::lowerTypeTestCalls( ArrayRef TypeIds, Constant *CombinedGlobalAddr, const DenseMap &GlobalLayout) { - Constant *CombinedGlobalIntAddr = - ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy); - DenseMap GlobalObjLayout; - for (auto &P : GlobalLayout) - GlobalObjLayout[P.first->getGlobal()] = P.second; + CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy); // For each type identifier in this disjoint set... for (Metadata *TypeId : TypeIds) { @@ -640,13 +676,43 @@ void LowerTypeTestsModule::lowerTypeTestCalls( BSI.print(dbgs()); }); - ByteArrayInfo *BAI = nullptr; + TypeIdLowering TIL; + TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr( + Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)), + TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2); + if (BSI.isAllOnes()) { + TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single + : TypeTestResolution::AllOnes; + TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; + TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, + BSI.BitSize); + } else if (BSI.BitSize <= 64) { + TIL.TheKind = TypeTestResolution::Inline; + TIL.SizeBitWidth = (BSI.BitSize <= 32) ? 5 : 6; + TIL.Size = ConstantInt::get(Int8Ty, BSI.BitSize); + uint64_t InlineBits = 0; + for (auto Bit : BSI.Bits) + InlineBits |= uint64_t(1) << Bit; + if (InlineBits == 0) + TIL.TheKind = TypeTestResolution::Unsat; + else + TIL.InlineBits = ConstantInt::get( + (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits); + } else { + TIL.TheKind = TypeTestResolution::ByteArray; + TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; + TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, + BSI.BitSize); + ++NumByteArraysCreated; + ByteArrayInfo *BAI = createByteArray(BSI); + TIL.TheByteArray = BAI->ByteArray; + TIL.BitMask = BAI->MaskGlobal; + } // Lower each call to llvm.type.test for this type identifier. for (CallInst *CI : TypeTestCallSites[TypeId]) { ++NumTypeTestCallsLowered; - Value *Lowered = - lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalObjLayout); + Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL); CI->replaceAllUsesWith(Lowered); CI->eraseFromParent(); } @@ -1080,6 +1146,22 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( /// Lower all type tests in this module. LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { + // Handle the command-line summary arguments. This code is for testing + // purposes only, so we handle errors directly. 
+ if (!ClSummaryAction.empty()) { + OwnedSummary = make_unique(); + if (!ClReadSummary.empty()) { + ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary + + ": "); + auto ReadSummaryFile = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary))); + + yaml::Input In(ReadSummaryFile->getBuffer()); + In >> *OwnedSummary; + ExitOnErr(errorCodeToError(In.error())); + } + } + Triple TargetTriple(M.getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); Arch = TargetTriple.getArch(); @@ -1087,6 +1169,20 @@ LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { ObjectFormat = TargetTriple.getObjectFormat(); } +LowerTypeTestsModule::~LowerTypeTestsModule() { + if (ClSummaryAction.empty() || ClWriteSummary.empty()) + return; + + ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + + ": "); + std::error_code EC; + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + ExitOnErr(errorCodeToError(EC)); + + yaml::Output Out(OS); + Out << *OwnedSummary; +} + bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_test)); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index f863d192fc2..b29ed3c8745 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1637,6 +1637,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::cos: + case Intrinsic::amdgcn_cos: { + Value *SrcSrc; + Value *Src = II->getArgOperand(0); + if (match(Src, m_FNeg(m_Value(SrcSrc))) || + match(Src, m_Intrinsic(m_Value(SrcSrc)))) { + // cos(-x) -> cos(x) + // cos(fabs(x)) -> cos(x) + II->setArgOperand(0, SrcSrc); + return II; + } + + break; + } case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: // Turn PPC lvx -> load if the pointer is known aligned. diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 6a7cb0e45c6..1d552839877 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -514,7 +514,8 @@ struct AddressSanitizer : public FunctionPass { void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp); - void instrumentUnusualSizeOrAlignment(Instruction *I, Value *Addr, + void instrumentUnusualSizeOrAlignment(Instruction *I, + Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp); @@ -1056,20 +1057,18 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, return nullptr; *IsWrite = false; } - // Only instrument if the mask is constant for now. - if (isa(CI->getOperand(2 + OpOffset))) { - auto BasePtr = CI->getOperand(0 + OpOffset); - auto Ty = cast(BasePtr->getType())->getElementType(); - *TypeSize = DL.getTypeStoreSizeInBits(Ty); - if (auto AlignmentConstant = - dyn_cast(CI->getOperand(1 + OpOffset))) - *Alignment = (unsigned)AlignmentConstant->getZExtValue(); - else - *Alignment = 1; // No alignment guarantees. 
We probably got Undef - if (MaybeMask) - *MaybeMask = CI->getOperand(2 + OpOffset); - PtrOperand = BasePtr; - } + + auto BasePtr = CI->getOperand(0 + OpOffset); + auto Ty = cast(BasePtr->getType())->getElementType(); + *TypeSize = DL.getTypeStoreSizeInBits(Ty); + if (auto AlignmentConstant = + dyn_cast(CI->getOperand(1 + OpOffset))) + *Alignment = (unsigned)AlignmentConstant->getZExtValue(); + else + *Alignment = 1; // No alignment guarantees. We probably got Undef + if (MaybeMask) + *MaybeMask = CI->getOperand(2 + OpOffset); + PtrOperand = BasePtr; } } @@ -1130,24 +1129,25 @@ void AddressSanitizer::instrumentPointerComparisonOrSubtraction( } static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I, - Value *Addr, unsigned Alignment, - unsigned Granularity, uint32_t TypeSize, - bool IsWrite, Value *SizeArgument, - bool UseCalls, uint32_t Exp) { + Instruction *InsertBefore, Value *Addr, + unsigned Alignment, unsigned Granularity, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, + uint32_t Exp) { // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check // if the data is properly aligned. if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || TypeSize == 128) && (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8)) - return Pass->instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, - UseCalls, Exp); - Pass->instrumentUnusualSizeOrAlignment(I, Addr, TypeSize, IsWrite, nullptr, - UseCalls, Exp); + return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite, + nullptr, UseCalls, Exp); + Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize, + IsWrite, nullptr, UseCalls, Exp); } static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, const DataLayout &DL, Type *IntptrTy, - ConstantVector *Mask, Instruction *I, + Value *Mask, Instruction *I, Value *Addr, unsigned Alignment, unsigned Granularity, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, @@ -1157,15 +1157,30 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, unsigned Num = VTy->getVectorNumElements(); auto Zero = ConstantInt::get(IntptrTy, 0); for (unsigned Idx = 0; Idx < Num; ++Idx) { - // dyn_cast as we might get UndefValue - auto Masked = dyn_cast(Mask->getOperand(Idx)); - if (Masked && Masked->isAllOnesValue()) { + Value *InstrumentedAddress = nullptr; + Instruction *InsertBefore = I; + if (auto *Vector = dyn_cast(Mask)) { + // dyn_cast as we might get UndefValue + if (auto *Masked = dyn_cast(Vector->getOperand(Idx))) { + if (Masked->isNullValue()) + // Mask is constant false, so no instrumentation needed. 
+ continue; + // If we have a true or undef value, fall through to doInstrumentAddress + // with InsertBefore == I + } + } else { IRBuilder<> IRB(I); - auto InstrumentedAddress = - IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)}); - doInstrumentAddress(Pass, I, InstrumentedAddress, Alignment, Granularity, - ElemTypeSize, IsWrite, SizeArgument, UseCalls, Exp); + Value *MaskElem = IRB.CreateExtractElement(Mask, Idx); + TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false); + InsertBefore = ThenTerm; } + + IRBuilder<> IRB(InsertBefore); + InstrumentedAddress = + IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)}); + doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment, + Granularity, ElemTypeSize, IsWrite, SizeArgument, + UseCalls, Exp); } } @@ -1220,12 +1235,11 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, unsigned Granularity = 1 << Mapping.Scale; if (MaybeMask) { - auto Mask = cast(MaybeMask); - instrumentMaskedLoadOrStore(this, DL, IntptrTy, Mask, I, Addr, Alignment, - Granularity, TypeSize, IsWrite, nullptr, - UseCalls, Exp); + instrumentMaskedLoadOrStore(this, DL, IntptrTy, MaybeMask, I, Addr, + Alignment, Granularity, TypeSize, IsWrite, + nullptr, UseCalls, Exp); } else { - doInstrumentAddress(this, I, Addr, Alignment, Granularity, TypeSize, + doInstrumentAddress(this, I, I, Addr, Alignment, Granularity, TypeSize, IsWrite, nullptr, UseCalls, Exp); } } @@ -1342,9 +1356,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able // to report the actual access size. void AddressSanitizer::instrumentUnusualSizeOrAlignment( - Instruction *I, Value *Addr, uint32_t TypeSize, bool IsWrite, - Value *SizeArgument, bool UseCalls, uint32_t Exp) { - IRBuilder<> IRB(I); + Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, + bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) { + IRBuilder<> IRB(InsertBefore); Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); if (UseCalls) { @@ -1358,8 +1372,8 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment( Value *LastByte = IRB.CreateIntToPtr( IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), Addr->getType()); - instrumentAddress(I, I, Addr, 8, IsWrite, Size, false, Exp); - instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false, Exp); + instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp); + instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp); } } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 9485bfd7c29..0137378b828 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1572,6 +1572,13 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Assign value numbers to the new instructions. for (Instruction *I : NewInsts) { + // Instructions that have been inserted in predecessor(s) to materialize + // the load address do not retain their original debug locations. Doing + // so could lead to confusing (but correct) source attributions. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? + I->setDebugLoc(DebugLoc()); + // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. 
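[Note on the instrumentMaskedLoadOrStore rewrite above: each vector lane now falls into one of three cases. A constant-false mask element is skipped outright, a constant-true or undef element is checked unconditionally before the access, and a non-constant element is extracted and guarded with SplitBlockAndInsertIfThen so the check only runs when the lane is enabled. A scalar model of the run-time behaviour this produces for a masked store (my own sketch; asanCheckWrite stands in for the shadow check emitted by instrumentAddress and is not a real ASan runtime symbol):

    #include <cstddef>

    void asanCheckWrite(void *Addr, std::size_t Size); // hypothetical check hook

    template <typename T, std::size_t N>
    void maskedStoreModel(T *Base, const bool (&Mask)[N], const T (&Val)[N]) {
      for (std::size_t Idx = 0; Idx < N; ++Idx) {
        if (!Mask[Idx])
          continue;                            // disabled lane: no access, no check
        asanCheckWrite(&Base[Idx], sizeof(T)); // per-lane guarded check
        Base[Idx] = Val[Idx];                  // the lane's actual store
      }
    }
]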
If a block hasn't been processed yet, we would be @@ -1601,8 +1608,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) NewLoad->setMetadata(LLVMContext::MD_range, RangeMD); - // Transfer DebugLoc. - NewLoad->setDebugLoc(LI->getDebugLoc()); + // We do not propagate the old load's debug location, because the new + // load now lives in a different BB, and we want to avoid a jumpy line + // table. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? // Add the newly created load. ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred, diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 1cc5c8f0da8..6ef9d056132 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -408,6 +408,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurAST->deleteValue(&I); I.eraseFromParent(); } + Changed = true; continue; } @@ -766,6 +767,14 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // Move the new node to the Preheader, before its terminator. I.moveBefore(Preheader->getTerminator()); + // Do not retain debug locations when we are moving instructions to different + // basic blocks, because we want to avoid jumpy line tables. Calls, however, + // need to retain their debug locs because they may be inlined. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? + if (!isa(I)) + I.setDebugLoc(DebugLoc()); + if (isa(I)) ++NumMovedLoads; else if (isa(I)) @@ -911,14 +920,23 @@ bool llvm::promoteLoopAccessesToScalars( // // If at least one store is guaranteed to execute, both properties are // satisfied, and promotion is legal. + // // This, however, is not a necessary condition. Even if no store/load is - // guaranteed to execute, we can still establish these properties: - // (p1) by proving that hoisting the load into the preheader is - // safe (i.e. proving dereferenceability on all paths through the loop). We + // guaranteed to execute, we can still establish these properties. + // We can establish (p1) by proving that hoisting the load into the preheader + // is safe (i.e. proving dereferenceability on all paths through the loop). We // can use any access within the alias set to prove dereferenceability, // since they're all must alias. - // (p2) by proving the memory is thread-local, so the memory model + // + // There are two ways establish (p2): + // a) Prove the location is thread-local. In this case the memory model // requirement does not apply, and stores are safe to insert. + // b) Prove a store dominates every exit block. In this case, if an exit + // blocks is reached, the original dynamic path would have taken us through + // the store, so inserting a store into the exit block is safe. Note that this + // is different from the store being guaranteed to execute. For instance, + // if an exception is thrown on the first iteration of the loop, the original + // store is never executed, but the exit blocks are not executed either. bool DereferenceableInPH = false; bool SafeToInsertStore = false; @@ -1000,6 +1018,17 @@ bool llvm::promoteLoopAccessesToScalars( } } + // If a store dominates all exit blocks, it is safe to sink. 
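[A small source-level illustration of the GVN change above (assumed example, not taken from the patch): the second a[i] below is partially redundant, so load PRE inserts the address computation and load into the predecessor where c is false. Those new instructions execute only on the !c path; if they kept the original load's source location, single-stepping that path would appear to jump into the if-body that was never taken, which is the "jumpy line table" the comments refer to.

    int f(int *a, int i, bool c) {
      int s = 0;
      if (c)
        s = a[i];        // a[i] is already available on this path
      return s + a[i];   // partially redundant load: PRE adds a copy on the !c path
    }
]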
+ // As explained above, if an exit block was executed, a dominating + // store must have been been executed at least once, so we are not + // introducing stores on paths that did not have them. + // Note that this only looks at explicit exit blocks. If we ever + // start sinking stores into unwind edges (see above), this will break. + if (!SafeToInsertStore) + SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) { + return DT->dominates(Store->getParent(), Exit); + }); + // If the store is not guaranteed to execute, we may still get // deref info through it. if (!DereferenceableInPH) { diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index fd167db1178..2743574ecca 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -997,7 +997,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { /// Check if the given conditional branch is based on the comparison between /// a variable and zero, and if the variable is non-zero, the control yields to /// the loop entry. If the branch matches the behavior, the variable involved -/// in the comparion is returned. This function will be called to see if the +/// in the comparison is returned. This function will be called to see if the /// precondition and postcondition of the loop are in desirable form. static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { if (!BI || !BI->isConditional()) diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index 90309d7ebba..f6435449777 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -283,8 +283,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // sinked. for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) { Instruction *I = &*II++; - if (!L.hasLoopInvariantOperands(I) || - !canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr)) + if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr)) continue; if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI)) Changed = true; diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 440e36767ed..678d02e05d4 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -56,12 +56,9 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( if (!isPerformingImport() && !isModuleExporting()) return false; - // If we are exporting, we need to see whether this value is marked - // as NoRename in the summary. If we are importing, we may not have - // a summary in the distributed backend case (only summaries for values - // importes as defs, not references, are included in the index passed - // to the distributed backends). if (isPerformingImport()) { + assert((!GlobalsToImport->count(SGV) || !isNonRenamableLocal(*SGV)) && + "Attempting to promote non-renamable local"); // We don't know for sure yet if we are importing this value (as either // a reference or a def), since we are simply walking all values in the // module. 
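[An example of the new store-promotion condition in promoteLoopAccessesToScalars above (mine, not from the patch; assume mayThrow() is known not to access *p, so promotion is otherwise legal): the store below is not guaranteed to execute, because mayThrow() can throw before the first iteration ever reaches it, yet its block dominates the loop's single exit. A store re-inserted in the exit block therefore only runs on paths that already stored to *p; an unwinding first iteration leaves through the unwind edge, not the exit block, which is exactly the caveat about unwind edges in the comment.

    void mayThrow();                 // hypothetical: may throw, does not touch *p

    void accumulate(int *p, int n) {
      int i = 0;
      do {
        mayThrow();                  // may leave the loop via an unwind edge...
        *p = i;                      // ...before this store ever executes once
      } while (++i < n);             // the store's block dominates this exit
    }
]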
But by necessity if we end up importing it and it is local, @@ -77,13 +74,28 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( assert(Summaries->second.size() == 1 && "Local has more than one summary"); auto Linkage = Summaries->second.front()->linkage(); if (!GlobalValue::isLocalLinkage(Linkage)) { - assert(!Summaries->second.front()->noRename()); + assert(!isNonRenamableLocal(*SGV) && + "Attempting to promote non-renamable local"); return true; } return false; } +#ifndef NDEBUG +bool FunctionImportGlobalProcessing::isNonRenamableLocal( + const GlobalValue &GV) const { + if (!GV.hasLocalLinkage()) + return false; + // This needs to stay in sync with the logic in buildModuleSummaryIndex. + if (GV.hasSection()) + return true; + if (Used.count(const_cast(&GV))) + return true; + return false; +} +#endif + std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV, bool DoPromote) { // For locals that must be promoted to global scope, ensure that diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8cde0c4cd60..31daba2248a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6785,22 +6785,19 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } -/// \brief Check whether the address computation for a non-consecutive memory -/// access looks like an unlikely candidate for being merged into the indexing -/// mode. +/// \brief Gets Address Access SCEV after verifying that the access pattern +/// is loop invariant except the induction variable dependence. /// -/// We look for a GEP which has one index that is an induction variable and all -/// other indices are loop invariant. If the stride of this access is also -/// within a small bound we decide that this address computation can likely be -/// merged into the addressing mode. -/// In all other cases, we identify the address computation as complex. -static bool isLikelyComplexAddressComputation(Value *Ptr, - LoopVectorizationLegality *Legal, - ScalarEvolution *SE, - const Loop *TheLoop) { +/// This SCEV can be sent to the Target in order to estimate the address +/// calculation cost. +static const SCEV *getAddressAccessSCEV( + Value *Ptr, + LoopVectorizationLegality *Legal, + ScalarEvolution *SE, + const Loop *TheLoop) { auto *Gep = dyn_cast(Ptr); if (!Gep) - return true; + return nullptr; // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. @@ -6809,33 +6806,11 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, Value *Opd = Gep->getOperand(i); if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && !Legal->isInductionVariable(Opd)) - return true; + return nullptr; } - // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step - // can likely be merged into the address computation. - unsigned MaxMergeDistance = 64; - - const SCEVAddRecExpr *AddRec = dyn_cast(SE->getSCEV(Ptr)); - if (!AddRec) - return true; - - // Check the step is constant. - const SCEV *Step = AddRec->getStepRecurrence(*SE); - // Calculate the pointer stride and check if it is consecutive. - const auto *C = dyn_cast(Step); - if (!C) - return true; - - const APInt &APStepVal = C->getAPInt(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return true; - - int64_t StepVal = APStepVal.getSExtValue(); - - return StepVal > MaxMergeDistance; + // Now we know we have a GEP ptr, %inv, %ind, %inv. 
return the Ptr SCEV. + return SE->getSCEV(Ptr); } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { @@ -7063,12 +7038,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned Cost = 0; Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - // True if the memory instruction's address computation is complex. - bool IsComplexComputation = - isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); + // Figure out whether the access is strided and get the stride value + // if it's known in compile time + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); + Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS); diff --git a/test/Analysis/CostModel/AArch64/bswap.ll b/test/Analysis/CostModel/AArch64/bswap.ll new file mode 100644 index 00000000000..a97127a631d --- /dev/null +++ b/test/Analysis/CostModel/AArch64/bswap.ll @@ -0,0 +1,70 @@ +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s + +; Verify the cost of bswap instructions. + +declare i16 @llvm.bswap.i16(i16) +declare i32 @llvm.bswap.i32(i32) +declare i64 @llvm.bswap.i64(i64) + +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) +declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) + +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) +declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) +declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) + +define i16 @bswap_i16(i16 %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_i16': +; CHECK: Found an estimated cost of 1 for instruction: %bswap + %bswap = tail call i16 @llvm.bswap.i16(i16 %a) + ret i16 %bswap +} + +define i32 @bswap_i32(i32 %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_i32': +; CHECK: Found an estimated cost of 1 for instruction: %bswap + %bswap = tail call i32 @llvm.bswap.i32(i32 %a) + ret i32 %bswap +} + +define i64 @bswap_i64(i64 %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_i64': +; CHECK: Found an estimated cost of 1 for instruction: %bswap + %bswap = tail call i64 @llvm.bswap.i64(i64 %a) + ret i64 %bswap +} + +define <2 x i32> @bswap_v2i32(<2 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v2i32': +; CHECK: Found an estimated cost of 8 for instruction: %bswap + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a) + ret <2 x i32> %bswap +} + +define <4 x i16> @bswap_v4i16(<4 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v4i16': +; CHECK: Found an estimated cost of 22 for instruction: %bswap + %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %a) + ret <4 x i16> %bswap +} + +define <2 x i64> @bswap_v2i64(<2 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v2i64': +; CHECK: Found an estimated cost of 8 for instruction: %bswap + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a) + ret <2 x i64> %bswap +} + +define <4 x i32> @bswap_v4i32(<4 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v4i32': +; CHECK: Found an estimated cost of 22 for instruction: %bswap + %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a) + ret <4 x i32> %bswap +} + +define <8 x i16> @bswap_v8i16(<8 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v8i16': +; CHECK: Found an estimated cost of 50 for instruction: %bswap + %bswap = call <8 x i16> @llvm.bswap.v8i16(<8 x 
i16> %a) + ret <8 x i16> %bswap +} diff --git a/test/Analysis/CostModel/AArch64/falkor.ll b/test/Analysis/CostModel/AArch64/falkor.ll new file mode 100644 index 00000000000..e9563191f07 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/falkor.ll @@ -0,0 +1,26 @@ +; RUN: opt < %s -cost-model -analyze -mcpu=falkor | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: vectorInstrCost +define void @vectorInstrCost() { + + ; Vector extracts - extracting the first element should have a zero cost; + ; all other elements should have a cost of two. + ; + ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0 + ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1 + %t1 = extractelement <2 x i64> undef, i32 0 + %t2 = extractelement <2 x i64> undef, i32 1 + + ; Vector inserts - inserting the first element should have a zero cost; all + ; other elements should have a cost of two. + ; + ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0 + ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 + %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 + %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 + + ret void +} diff --git a/test/Analysis/CostModel/AArch64/gep.ll b/test/Analysis/CostModel/AArch64/gep.ll index f3d83c13302..08bfc3d2123 100644 --- a/test/Analysis/CostModel/AArch64/gep.ll +++ b/test/Analysis/CostModel/AArch64/gep.ll @@ -1,9 +1,9 @@ -; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck %s +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" -define i8 @test1(i8* %p, i32 %i) { +define i8 @test1(i8* %p) { ; CHECK-LABEL: test1 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 0 @@ -11,7 +11,7 @@ define i8 @test1(i8* %p, i32 %i) { ret i8 %v } -define i16 @test2(i16* %p, i32 %i) { +define i16 @test2(i16* %p) { ; CHECK-LABEL: test2 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 0 @@ -19,7 +19,7 @@ define i16 @test2(i16* %p, i32 %i) { ret i16 %v } -define i32 @test3(i32* %p, i32 %i) { +define i32 @test3(i32* %p) { ; CHECK-LABEL: test3 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 0 @@ -27,7 +27,7 @@ define i32 @test3(i32* %p, i32 %i) { ret i32 %v } -define i64 @test4(i64* %p, i32 %i) { +define i64 @test4(i64* %p) { ; CHECK-LABEL: test4 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 0 @@ -35,7 +35,7 @@ define i64 @test4(i64* %p, i32 %i) { ret i64 %v } -define i8 @test5(i8* %p, i32 %i) { +define i8 @test5(i8* %p) { ; CHECK-LABEL: test5 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 1024 @@ -43,7 +43,7 @@ define i8 @test5(i8* %p, i32 %i) { ret i8 %v } -define i16 @test6(i16* %p, i32 %i) { +define i16 @test6(i16* %p) { ; CHECK-LABEL: test6 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 1024 @@ -51,7 +51,7 @@ define i16 @test6(i16* %p, i32 %i) { ret i16 %v } -define i32 @test7(i32* %p, i32 %i) { +define i32 @test7(i32* %p) { ; CHECK-LABEL: test7 ; CHECK: 
cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 1024 @@ -59,7 +59,7 @@ define i32 @test7(i32* %p, i32 %i) { ret i32 %v } -define i64 @test8(i64* %p, i32 %i) { +define i64 @test8(i64* %p) { ; CHECK-LABEL: test8 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 1024 @@ -67,7 +67,7 @@ define i64 @test8(i64* %p, i32 %i) { ret i64 %v } -define i8 @test9(i8* %p, i32 %i) { +define i8 @test9(i8* %p) { ; CHECK-LABEL: test9 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 4096 @@ -75,7 +75,7 @@ define i8 @test9(i8* %p, i32 %i) { ret i8 %v } -define i16 @test10(i16* %p, i32 %i) { +define i16 @test10(i16* %p) { ; CHECK-LABEL: test10 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 4096 @@ -83,7 +83,7 @@ define i16 @test10(i16* %p, i32 %i) { ret i16 %v } -define i32 @test11(i32* %p, i32 %i) { +define i32 @test11(i32* %p) { ; CHECK-LABEL: test11 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 4096 @@ -91,7 +91,7 @@ define i32 @test11(i32* %p, i32 %i) { ret i32 %v } -define i64 @test12(i64* %p, i32 %i) { +define i64 @test12(i64* %p) { ; CHECK-LABEL: test12 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 4096 @@ -99,7 +99,7 @@ define i64 @test12(i64* %p, i32 %i) { ret i64 %v } -define i8 @test13(i8* %p, i32 %i) { +define i8 @test13(i8* %p) { ; CHECK-LABEL: test13 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -64 @@ -107,7 +107,7 @@ define i8 @test13(i8* %p, i32 %i) { ret i8 %v } -define i16 @test14(i16* %p, i32 %i) { +define i16 @test14(i16* %p) { ; CHECK-LABEL: test14 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -64 @@ -115,7 +115,7 @@ define i16 @test14(i16* %p, i32 %i) { ret i16 %v } -define i32 @test15(i32* %p, i32 %i) { +define i32 @test15(i32* %p) { ; CHECK-LABEL: test15 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -64 @@ -123,7 +123,7 @@ define i32 @test15(i32* %p, i32 %i) { ret i32 %v } -define i64 @test16(i64* %p, i32 %i) { +define i64 @test16(i64* %p) { ; CHECK-LABEL: test16 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -64 @@ -131,7 +131,7 @@ define i64 @test16(i64* %p, i32 %i) { ret i64 %v } -define i8 @test17(i8* %p, i32 %i) { +define i8 @test17(i8* %p) { ; CHECK-LABEL: test17 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -1024 @@ -139,7 +139,7 @@ define i8 @test17(i8* %p, i32 %i) { ret i8 %v } -define i16 @test18(i16* %p, i32 %i) { +define i16 @test18(i16* %p) { ; CHECK-LABEL: test18 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -1024 @@ -147,7 +147,7 @@ define i16 @test18(i16* %p, i32 %i) { ret i16 %v } -define i32 @test19(i32* %p, i32 %i) { +define i32 @test19(i32* %p) { ; CHECK-LABEL: test19 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -1024 @@ -155,7 
+155,7 @@ define i32 @test19(i32* %p, i32 %i) { ret i32 %v } -define i64 @test20(i64* %p, i32 %i) { +define i64 @test20(i64* %p) { ; CHECK-LABEL: test20 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -1024 @@ -195,7 +195,7 @@ define i64 @test24(i64* %p, i32 %i) { ret i64 %v } -define i8 @test25(i8* %p, i32 %i) { +define i8 @test25(i8* %p) { ; CHECK-LABEL: test25 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -128 @@ -203,7 +203,7 @@ define i8 @test25(i8* %p, i32 %i) { ret i8 %v } -define i16 @test26(i16* %p, i32 %i) { +define i16 @test26(i16* %p) { ; CHECK-LABEL: test26 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -128 @@ -211,7 +211,7 @@ define i16 @test26(i16* %p, i32 %i) { ret i16 %v } -define i32 @test27(i32* %p, i32 %i) { +define i32 @test27(i32* %p) { ; CHECK-LABEL: test27 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -128 @@ -219,7 +219,7 @@ define i32 @test27(i32* %p, i32 %i) { ret i32 %v } -define i64 @test28(i64* %p, i32 %i) { +define i64 @test28(i64* %p) { ; CHECK-LABEL: test28 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -128 @@ -227,7 +227,7 @@ define i64 @test28(i64* %p, i32 %i) { ret i64 %v } -define i8 @test29(i8* %p, i32 %i) { +define i8 @test29(i8* %p) { ; CHECK-LABEL: test29 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -256 @@ -235,7 +235,7 @@ define i8 @test29(i8* %p, i32 %i) { ret i8 %v } -define i16 @test30(i16* %p, i32 %i) { +define i16 @test30(i16* %p) { ; CHECK-LABEL: test30 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -256 @@ -243,7 +243,7 @@ define i16 @test30(i16* %p, i32 %i) { ret i16 %v } -define i32 @test31(i32* %p, i32 %i) { +define i32 @test31(i32* %p) { ; CHECK-LABEL: test31 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -256 @@ -251,7 +251,7 @@ define i32 @test31(i32* %p, i32 %i) { ret i32 %v } -define i64 @test32(i64* %p, i32 %i) { +define i64 @test32(i64* %p) { ; CHECK-LABEL: test32 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -256 @@ -259,7 +259,7 @@ define i64 @test32(i64* %p, i32 %i) { ret i64 %v } -define i8 @test33(i8* %p, i32 %i) { +define i8 @test33(i8* %p) { ; CHECK-LABEL: test33 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -512 @@ -267,7 +267,7 @@ define i8 @test33(i8* %p, i32 %i) { ret i8 %v } -define i16 @test34(i16* %p, i32 %i) { +define i16 @test34(i16* %p) { ; CHECK-LABEL: test34 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -512 @@ -275,7 +275,7 @@ define i16 @test34(i16* %p, i32 %i) { ret i16 %v } -define i32 @test35(i32* %p, i32 %i) { +define i32 @test35(i32* %p) { ; CHECK-LABEL: test35 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -512 @@ -283,7 +283,7 @@ define i32 @test35(i32* %p, i32 %i) { ret i32 %v } -define i64 @test36(i64* %p, i32 %i) { 
+define i64 @test36(i64* %p) { ; CHECK-LABEL: test36 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -512 diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll index 7319efb413d..b7a615f55cd 100644 --- a/test/Analysis/CostModel/X86/arith.ll +++ b/test/Analysis/CostModel/X86/arith.ll @@ -436,7 +436,7 @@ define i32 @mul(i32 %arg) { %A = mul <2 x i64> undef, undef ; SSSE3: cost of 16 {{.*}} %B = mul ; SSE42: cost of 16 {{.*}} %B = mul - ; AVX: cost of 16 {{.*}} %B = mul + ; AVX: cost of 18 {{.*}} %B = mul ; AVX2: cost of 8 {{.*}} %B = mul ; AVX512F: cost of 8 {{.*}} %B = mul ; AVX512BW: cost of 8 {{.*}} %B = mul @@ -444,7 +444,7 @@ define i32 @mul(i32 %arg) { %B = mul <4 x i64> undef, undef ; SSSE3: cost of 32 {{.*}} %C = mul ; SSE42: cost of 32 {{.*}} %C = mul - ; AVX: cost of 32 {{.*}} %C = mul + ; AVX: cost of 36 {{.*}} %C = mul ; AVX2: cost of 16 {{.*}} %C = mul ; AVX512F: cost of 8 {{.*}} %C = mul ; AVX512BW: cost of 8 {{.*}} %C = mul diff --git a/test/Analysis/CostModel/X86/shuffle-broadcast.ll b/test/Analysis/CostModel/X86/shuffle-broadcast.ll index a829a47f89f..86cf7569a72 100644 --- a/test/Analysis/CostModel/X86/shuffle-broadcast.ll +++ b/test/Analysis/CostModel/X86/shuffle-broadcast.ll @@ -18,14 +18,150 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer ; SSE: cost of 1 {{.*}} %V256 = shufflevector - ; AVX: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer ; SSE: cost of 1 {{.*}} %V512 = shufflevector - ; AVX: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer ret void } + +; CHECK-LABEL: 'test_vXi64' +define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: 
cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi32' +define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) { + ; SSE2: cost of 2 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer + + ; SSE2: cost of 2 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector + ; SSE42: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 3 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer + + ; SSE2: cost of 2 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector + ; SSE42: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 3 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { + ; SSE2: cost of 3 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} 
%V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer + + ; SSE2: cost of 3 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector + ; SSE42: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer + + ; SSE2: cost of 3 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector + ; SSE42: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer + + ret void +} diff --git a/test/Analysis/CostModel/X86/vdiv-cost.ll b/test/Analysis/CostModel/X86/vdiv-cost.ll index c8e4557cbef..a45bb4b3d0d 100644 --- a/test/Analysis/CostModel/X86/vdiv-cost.ll +++ b/test/Analysis/CostModel/X86/vdiv-cost.ll @@ -1,13 +1,20 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ define <4 x i32> @test1(<4 x i32> %a) { %div = udiv <4 x i32> %a, ret <4 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test1': -; SSE2: Found an estimated cost of 15 for instruction: %div -; AVX2: Found an estimated cost of 15 for instruction: %div +; SSE: Found an estimated cost of 15 for instruction: %div +; AVX: Found an estimated cost of 15 for instruction: %div +; AVX512: Found an estimated cost of 15 for instruction: %div } define <8 x i32> @test2(<8 x i32> %a) { @@ -15,8 +22,10 @@ define <8 x i32> @test2(<8 x i32> %a) { ret <8 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test2': -; SSE2: Found an estimated cost of 30 for instruction: %div +; SSE: Found an estimated cost of 30 for instruction: %div +; AVX1: Found an 
estimated cost of 30 for instruction: %div ; AVX2: Found an estimated cost of 15 for instruction: %div +; AVX512: Found an estimated cost of 15 for instruction: %div } define <8 x i16> @test3(<8 x i16> %a) { @@ -24,8 +33,9 @@ define <8 x i16> @test3(<8 x i16> %a) { ret <8 x i16> %div ; CHECK: 'Cost Model Analysis' for function 'test3': -; SSE2: Found an estimated cost of 6 for instruction: %div -; AVX2: Found an estimated cost of 6 for instruction: %div +; SSE: Found an estimated cost of 6 for instruction: %div +; AVX: Found an estimated cost of 6 for instruction: %div +; AVX512: Found an estimated cost of 6 for instruction: %div } define <16 x i16> @test4(<16 x i16> %a) { @@ -33,8 +43,10 @@ define <16 x i16> @test4(<16 x i16> %a) { ret <16 x i16> %div ; CHECK: 'Cost Model Analysis' for function 'test4': -; SSE2: Found an estimated cost of 12 for instruction: %div +; SSE: Found an estimated cost of 12 for instruction: %div +; AVX1: Found an estimated cost of 12 for instruction: %div ; AVX2: Found an estimated cost of 6 for instruction: %div +; AVX512: Found an estimated cost of 6 for instruction: %div } define <8 x i16> @test5(<8 x i16> %a) { @@ -42,8 +54,9 @@ define <8 x i16> @test5(<8 x i16> %a) { ret <8 x i16> %div ; CHECK: 'Cost Model Analysis' for function 'test5': -; SSE2: Found an estimated cost of 6 for instruction: %div -; AVX2: Found an estimated cost of 6 for instruction: %div +; SSE: Found an estimated cost of 6 for instruction: %div +; AVX: Found an estimated cost of 6 for instruction: %div +; AVX512: Found an estimated cost of 6 for instruction: %div } define <16 x i16> @test6(<16 x i16> %a) { @@ -51,8 +64,10 @@ define <16 x i16> @test6(<16 x i16> %a) { ret <16 x i16> %div ; CHECK: 'Cost Model Analysis' for function 'test6': -; SSE2: Found an estimated cost of 12 for instruction: %div +; SSE: Found an estimated cost of 12 for instruction: %div +; AVX1: Found an estimated cost of 12 for instruction: %div ; AVX2: Found an estimated cost of 6 for instruction: %div +; AVX512: Found an estimated cost of 6 for instruction: %div } define <16 x i8> @test7(<16 x i8> %a) { @@ -60,8 +75,9 @@ define <16 x i8> @test7(<16 x i8> %a) { ret <16 x i8> %div ; CHECK: 'Cost Model Analysis' for function 'test7': -; SSE2: Found an estimated cost of 320 for instruction: %div -; AVX2: Found an estimated cost of 320 for instruction: %div +; SSE: Found an estimated cost of 320 for instruction: %div +; AVX: Found an estimated cost of 320 for instruction: %div +; AVX512: Found an estimated cost of 320 for instruction: %div } define <4 x i32> @test8(<4 x i32> %a) { @@ -69,8 +85,9 @@ define <4 x i32> @test8(<4 x i32> %a) { ret <4 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test8': -; SSE2: Found an estimated cost of 19 for instruction: %div -; AVX2: Found an estimated cost of 15 for instruction: %div +; SSE: Found an estimated cost of 19 for instruction: %div +; AVX: Found an estimated cost of 15 for instruction: %div +; AVX512: Found an estimated cost of 15 for instruction: %div } define <8 x i32> @test9(<8 x i32> %a) { @@ -78,8 +95,10 @@ define <8 x i32> @test9(<8 x i32> %a) { ret <8 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test9': -; SSE2: Found an estimated cost of 38 for instruction: %div +; SSE: Found an estimated cost of 38 for instruction: %div +; AVX1: Found an estimated cost of 38 for instruction: %div ; AVX2: Found an estimated cost of 15 for instruction: %div +; AVX512: Found an estimated cost of 15 for instruction: %div } define <8 x i32> @test10(<8 x i32> %a) { @@ 
-87,6 +106,17 @@ define <8 x i32> @test10(<8 x i32> %a) { ret <8 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test10': -; SSE2: Found an estimated cost of 160 for instruction: %div -; AVX2: Found an estimated cost of 160 for instruction: %div +; SSE: Found an estimated cost of 160 for instruction: %div +; AVX: Found an estimated cost of 160 for instruction: %div +; AVX512: Found an estimated cost of 160 for instruction: %div +} + +define <16 x i32> @test11(<16 x i32> %a) { + %div = sdiv <16 x i32> %a, + ret <16 x i32> %div + +; CHECK: 'Cost Model Analysis' for function 'test11': +; SSE: Found an estimated cost of 320 for instruction: %div +; AVX: Found an estimated cost of 320 for instruction: %div +; AVX512: Found an estimated cost of 320 for instruction: %div } diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll index e53e40b57e1..888164df75f 100644 --- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll @@ -1,9 +1,12 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK -check-prefix=XOP --check-prefix=XOPAVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK -check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW ; Verify the cost of vector arithmetic shift right instructions. 
@@ -17,6 +20,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <2 x i64> %a, %b ret <2 x i64> %shift @@ -28,17 +32,31 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift } +define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': +; SSE2: Found an estimated cost of 48 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <8 x i64> %a, %b + ret <8 x i64> %shift +} + define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <4 x i32> %a, %b @@ -51,18 +69,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, %b ret <8 x i32> %shift } +define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = ashr <16 x i32> %a, %b + ret <16 x i32> %shift +} + define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 
14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <8 x i16> %a, %b ret <8 x i16> %shift @@ -74,17 +107,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } +define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <32 x i16> %a, %b + ret <32 x i16> %shift +} + define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -96,11 +144,26 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 48 for instruction: %shift ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512F: Found an estimated cost of 24 for instruction: %shift +; AVX512BW: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } +define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': +; SSE2: Found an estimated cost of 216 for instruction: %shift +; SSE41: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX2: Found an estimated cost of 48 for instruction: %shift +; AVX512F: Found an estimated cost of 48 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <64 x i8> %a, %b + ret <64 x i8> %shift +} + ; ; Uniform Variable Shifts ; @@ -111,6 +174,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer %shift = ashr <2 x i64> %a, %splat @@ 
-123,18 +187,33 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat ret <4 x i64> %shift } +define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': +; SSE2: Found an estimated cost of 48 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = ashr <8 x i64> %a, %splat + ret <8 x i64> %shift +} + define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer @@ -148,6 +227,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -155,12 +235,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %shift } +define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = ashr <16 x i32> %a, %splat + ret <16 x i32> %shift +} + define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for 
instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i16> %a, %splat @@ -173,18 +268,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i16> %a, %splat ret <16 x i16> %shift } +define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = ashr <32 x i16> %a, %splat + ret <32 x i16> %shift +} + define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i8> %a, %splat @@ -197,12 +308,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 48 for instruction: %shift ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat ret <32 x i8> %shift } +define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': +; SSE2: Found an estimated cost of 216 for instruction: %shift +; SSE41: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX2: Found an estimated cost of 48 for instruction: %shift +; AVX512F: Found an estimated cost of 48 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = ashr <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; @@ -213,6 +339,7 @@ 
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <2 x i64> %a, ret <2 x i64> %shift @@ -224,17 +351,31 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': +; SSE2: Found an estimated cost of 48 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <4 x i32> %a, @@ -247,18 +388,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = ashr <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for 
instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -270,17 +426,32 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -292,11 +463,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 48 for instruction: %shift ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } +define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': +; SSE2: Found an estimated cost of 216 for instruction: %shift +; SSE41: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX2: Found an estimated cost of 48 for instruction: %shift +; AVX512F: Found an estimated cost of 48 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <64 x i8> %a, + ret <64 x i8> %shift +} + ; ; Uniform Constant Shifts ; @@ -307,6 +492,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <2 x i64> %a, ret <2 x i64> %shift @@ -318,17 +504,31 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; 
AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <4 x i32> %a, @@ -341,18 +541,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = ashr <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -364,17 +579,32 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { +; CHECK: 
'Cost Model Analysis' for function 'splatconstant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -386,7 +616,21 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } + +define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 48 for instruction: %shift +; AVX512F: Found an estimated cost of 48 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <64 x i8> %a, + ret <64 x i8> %shift +} diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index 6d028268ea5..b3382253739 100644 --- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -1,9 +1,12 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s 
-mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW ; Verify the cost of vector logical shift right instructions. @@ -17,6 +20,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, %b @@ -29,18 +33,33 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, %b ret <4 x i64> %shift } +define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <8 x i64> %a, %b + ret <8 x i64> %shift +} + define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> 
%a, %b @@ -53,18 +72,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, %b ret <8 x i32> %shift } +define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, %b + ret <16 x i32> %shift +} + define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -76,17 +110,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } +define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, %b + ret <32 x i16> %shift +} + define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -98,11 +147,25 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for 
instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } +define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, %b + ret <64 x i8> %shift +} + ; ; Uniform Variable Shifts ; @@ -113,6 +176,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -126,6 +190,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -133,12 +198,27 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ret <4 x i64> %shift } +define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i64> %a, %splat + ret <8 x i64> %shift +} + define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer @@ -152,6 +232,7 @@ define <8 x i32> 
@splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -159,12 +240,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %shift } +define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = lshr <16 x i32> %a, %splat + ret <16 x i32> %shift +} + define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = lshr <8 x i16> %a, %splat @@ -177,18 +273,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i16> %a, %splat ret <16 x i16> %shift } +define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = lshr <32 x i16> %a, %splat + ret <32 x i16> %shift +} + define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found 
an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i8> %a, %splat @@ -201,12 +313,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat ret <32 x i8> %shift } +define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = lshr <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; @@ -217,6 +344,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, @@ -229,18 +357,33 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: 
%shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> %a, @@ -253,18 +396,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -276,17 +434,32 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = 
lshr <16 x i8> %a, ret <16 x i8> %shift @@ -298,11 +471,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } +define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, + ret <64 x i8> %shift +} + ; ; Uniform Constant Shifts ; @@ -313,6 +500,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, @@ -325,18 +513,33 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> %a, @@ -349,18 +552,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: 
%shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -372,17 +590,32 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -394,7 +627,21 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> 
%a, ret <32 x i8> %shift } + +define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, + ret <64 x i8> %shift +} diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll index 60ba3adea42..804c5a76c31 100644 --- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -1,9 +1,12 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW ; Verify the cost of vector shift left instructions. 
@@ -18,6 +21,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, %b @@ -30,18 +34,33 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift } +define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, %b + ret <8 x i64> %shift +} + define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 10 for instruction: %shift ; SSE41: Found an estimated cost of 10 for instruction: %shift ; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i32> %a, %b @@ -54,18 +73,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 20 for instruction: %shift ; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, %b ret <8 x i32> %shift } +define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': +; SSE2: Found an estimated cost of 40 for instruction: %shift +; SSE41: Found an estimated cost of 40 for instruction: %shift +; AVX: Found an estimated cost of 40 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <16 x i32> %a, %b + ret <16 x i32> %shift +} + define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for 
instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -77,17 +111,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } +define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <32 x i16> %a, %b + ret <32 x i16> %shift +} + define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -99,11 +148,25 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } +define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, %b + ret <64 x i8> %shift +} + ; ; Uniform Variable Shifts ; @@ -114,6 +177,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: 
Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -127,6 +191,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -134,12 +199,27 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ret <4 x i64> %shift } +define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i64> %a, %splat + ret <8 x i64> %shift +} + define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 10 for instruction: %shift ; SSE41: Found an estimated cost of 10 for instruction: %shift ; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer @@ -153,6 +233,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 20 for instruction: %shift ; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -160,12 +241,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %shift } +define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': +; SSE2: Found an estimated cost of 40 for instruction: %shift +; SSE41: Found an estimated cost of 40 for instruction: %shift +; AVX: Found an estimated cost of 40 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = shl <16 x i32> %a, %splat + ret <16 x 
i32> %shift +} + define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = shl <8 x i16> %a, %splat @@ -178,18 +274,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = shl <16 x i16> %a, %splat ret <16 x i16> %shift } +define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = shl <32 x i16> %a, %splat + ret <32 x i16> %shift +} + define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = shl <16 x i8> %a, %splat @@ -202,12 +314,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat ret <32 x i8> %shift } +define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift 
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = shl <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; @@ -218,6 +345,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, @@ -230,18 +358,33 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 6 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i32> %a, @@ -254,18 +397,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': +; SSE2: Found an estimated cost of 24 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: 
%shift + %shift = shl <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, ret <8 x i16> %shift @@ -277,18 +435,34 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512F: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -300,11 +474,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, ret <32 x i8> %shift } +define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, + ret <64 x i8> %shift +} + ; ; Uniform Constant Shifts ; @@ -315,6 +503,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated 
cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, @@ -327,18 +516,33 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i32> %a, @@ -351,18 +555,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for 
instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, ret <8 x i16> %shift @@ -374,18 +593,34 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512F: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -397,11 +632,25 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, ret <32 x i8> %shift } +define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, + ret <64 x i8> %shift +} + ; ; Special Cases ; diff --git a/test/Bitcode/summary_version.ll b/test/Bitcode/summary_version.ll index dfb9e9b15e7..81025a221bb 100644 --- a/test/Bitcode/summary_version.ll +++ b/test/Bitcode/summary_version.ll @@ -2,7 +2,7 @@ ; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s ; CHECK: +; CHECK: diff --git a/test/Bitcode/thinlto-function-summary.ll b/test/Bitcode/thinlto-function-summary.ll index e42c55c1c2e..594aaab566d 100644 --- 
a/test/Bitcode/thinlto-function-summary.ll +++ b/test/Bitcode/thinlto-function-summary.ll @@ -10,7 +10,7 @@ ; BC-NEXT: . ; AdrpAddStr cannot be used when the store uses same ; register as address and value. Indeed, the related diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll index b697b6eced3..c7ba989d933 100644 --- a/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-enable-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-enable-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF +; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64-linux-gnu -O2 | FileCheck %s --check-prefix=CHECK-ELF ; CHECK-ELF-NOT: .loh ; CHECK-ELF-NOT: AdrpAdrp @@ -633,11 +633,14 @@ define void @setL(<1 x i8> %t) { ; a tuple register to appear in the lowering. Thus, the target ; cpu is required to have the problem reproduced. ; CHECK-LABEL: _uninterestingSub +; CHECK: [[LOH_LABEL0:Lloh[0-9]+]]: ; CHECK: adrp [[ADRP_REG:x[0-9]+]], [[CONSTPOOL:lCPI[0-9]+_[0-9]+]]@PAGE -; CHECK-NEXT: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] +; CHECK: [[LOH_LABEL1:Lloh[0-9]+]]: +; CHECK: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] ; The tuple comes from the next instruction. ; CHECK-NEXT: tbl.16b v{{[0-9]+}}, { v{{[0-9]+}}, v{{[0-9]+}} }, v[[IDX]] ; CHECK: ret +; CHECK: .loh AdrpLdr [[LOH_LABEL0]], [[LOH_LABEL1]] define void @uninterestingSub(i8* nocapture %row) #0 { %tmp = bitcast i8* %row to <16 x i8>* %tmp1 = load <16 x i8>, <16 x i8>* %tmp, align 16 @@ -664,10 +667,10 @@ entry: if.then.i: ret void if.end.i: -; CHECK: .loh AdrpAdrp Lloh91, Lloh93 -; CHECK: .loh AdrpLdr Lloh91, Lloh92 -; CHECK: .loh AdrpLdrGot Lloh93, Lloh95 -; CHECK: .loh AdrpLdrGot Lloh94, Lloh96 +; CHECK: .loh AdrpLdrGot +; CHECK: .loh AdrpLdrGot +; CHECK: .loh AdrpAdrp +; CHECK: .loh AdrpLdr %mul.i.i.i = fmul double undef, 1.000000e-06 %add.i.i.i = fadd double undef, %mul.i.i.i %sub.i.i = fsub double %add.i.i.i, undef diff --git a/test/CodeGen/AArch64/loh.mir b/test/CodeGen/AArch64/loh.mir new file mode 100644 index 00000000000..1d08ebdc579 --- /dev/null +++ b/test/CodeGen/AArch64/loh.mir @@ -0,0 +1,193 @@ +# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @func0() { ret void } + + declare void @extfunc() + + @g0 = external global i32 + @g1 = external global i32 + @g2 = external global i32 + @g3 = external global i32 + @g4 = external global i32 + @g5 = external global i32 +... +--- +# Check various LOH variants. Remember that the algorithms walks the basic +# blocks backwards. 
+# CHECK-LABEL: ********** AArch64 Collect LOH ********** +# CHECK-LABEL: Looking in function func0 +name: func0 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X0 = ADRP + ; CHECK-NEXT: %X0 = ADRP + %x0 = ADRP target-flags(aarch64-page) @g0 + %x0 = ADRP target-flags(aarch64-page) @g1 + %x1 = ADRP target-flags(aarch64-page) @g2 + %x1 = ADRP target-flags(aarch64-page) @g3 + %x1 = ADRP target-flags(aarch64-page) @g4 + + bb.1: + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X20 = ADRP + ; CHECK-NEXT: %X3 = ADDXri %X20, + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = ADDXri %X1, + %x1 = ADRP target-flags(aarch64-page) @g0 + %x9 = SUBXri undef %x11, 5, 0 ; should not affect MCLOH formation + %x1 = ADDXri %x1, target-flags(aarch64-pageoff) @g0, 0 + %x20 = ADRP target-flags(aarch64-page) @g0 + BL @extfunc, csr_aarch64_aapcs ; should not clobber X20 + %x3 = ADDXri %x20, target-flags(aarch64-pageoff) @g0, 0 + + bb.2: + ; CHECK-NOT: MCLOH_AdrpAdd + %x9 = ADRP target-flags(aarch64-page) @g0 + BL @extfunc, csr_aarch64_aapcs ; clobbers x9 + ; Verification requires the use of 'undef' in front of the clobbered %x9 + %x9 = ADDXri undef %x9, target-flags(aarch64-pageoff) @g0, 0 + + bb.3: + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + HINT 0, implicit def %x10 ; clobbers x10 + %x10 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + + bb.4: + ; Cannot produce a LOH for multiple users + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + HINT 0, implicit def %x10 ; clobbers x10 + %x11 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + %x12 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + + bb.5: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X5 = ADRP + ; CHECK-NEXT: %S6 = LDRSui %X5, + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X4 = ADRP + ; CHECK-NEXT: %X4 = LDRXui %X4, + %x4 = ADRP target-flags(aarch64-page) @g2 + %x4 = LDRXui %x4, target-flags(aarch64-pageoff) @g2 + %x5 = ADRP target-flags(aarch64-page) @g2 + %s6 = LDRSui %x5, target-flags(aarch64-pageoff) @g2 + + bb.6: + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGot: + ; CHECK-NEXT: %X5 = ADRP + ; CHECK-NEXT: %X6 = LDRXui %X5, + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGot: + ; CHECK-NEXT: %X4 = ADRP + ; CHECK-NEXT: %X4 = LDRXui %X4, + %x4 = ADRP target-flags(aarch64-page, aarch64-got) @g2 + %x4 = LDRXui %x4, target-flags(aarch64-pageoff, aarch64-got) @g2 + %x5 = ADRP target-flags(aarch64-page, aarch64-got) @g2 + %x6 = LDRXui %x5, target-flags(aarch64-pageoff, aarch64-got) @g2 + + bb.7: + ; CHECK-NOT: Adding MCLOH_AdrpLdrGot: + ; Loading a float value from a GOT table makes no sense so this should not + ; produce an LOH. 
+ %x11 = ADRP target-flags(aarch64-page, aarch64-got) @g5 + %s11 = LDRSui %x11, target-flags(aarch64-pageoff, aarch64-got) @g5 + + bb.8: + ; CHECK-NEXT: Adding MCLOH_AdrpAddLdr: + ; CHECK-NEXT: %X7 = ADRP [TF=1] + ; CHECK-NEXT: %X8 = ADDXri %X7, + ; CHECK-NEXT: %D1 = LDRDui %X8, 8 + %x7 = ADRP target-flags(aarch64-page) @g3 + %x8 = ADDXri %x7, target-flags(aarch64-pageoff) @g3, 0 + %d1 = LDRDui %x8, 8 + + bb.9: + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X3 = ADRP + ; CHECK-NEXT: %X3 = ADDXri %X3, + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X5 = ADRP + ; CHECK-NEXT: %X2 = ADDXri %X5, + ; CHECK-NEXT: Adding MCLOH_AdrpAddStr: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = ADDXri %X1, + ; CHECK-NEXT: STRXui %XZR, %X1, 16 + %x1 = ADRP target-flags(aarch64-page) @g3 + %x1 = ADDXri %x1, target-flags(aarch64-pageoff) @g3, 0 + STRXui %xzr, %x1, 16 + + ; This sequence should just produce an AdrpAdd (not AdrpAddStr) + %x5 = ADRP target-flags(aarch64-page) @g3 + %x2 = ADDXri %x5, target-flags(aarch64-pageoff) @g3, 0 + STRXui %x2, undef %x11, 16 + + ; This sequence should just produce an AdrpAdd (not AdrpAddStr) + %x3 = ADRP target-flags(aarch64-page) @g3 + %x3 = ADDXri %x3, target-flags(aarch64-pageoff) @g3, 0 + STRXui %x3, %x3, 16 + + bb.10: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X2 = ADRP + ; CHECK-NEXT: %X2 = LDRXui %X2, + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGotLdr: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = LDRXui %X1, + ; CHECK-NEXT: %X1 = LDRXui %X1, 24 + %x1 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x1 = LDRXui %x1, target-flags(aarch64-pageoff, aarch64-got) @g4 + %x1 = LDRXui %x1, 24 + ; Should just produce a MCLOH_AdrpLdr (not MCLOH_AdrpLdrGotLdr) + %x2 = ADRP target-flags(aarch64-page) @g3 + %x2 = LDRXui %x2, target-flags(aarch64-pageoff) @g3 + %x2 = LDRXui %x2, 24 + + bb.11: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr + ; CHECK-NEXT: %X5 = ADRP + ; CHECK-NEXT: %X5 = LDRXui %X5, + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGotStr: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = LDRXui %X1, + ; CHECK-NEXT: STRXui %XZR, %X1, 32 + %x1 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x1 = LDRXui %x1, target-flags(aarch64-pageoff, aarch64-got) @g4 + STRXui %xzr, %x1, 32 + ; Should just produce a MCLOH_AdrpLdr (not MCLOH_AdrpLdrGotStr) + %x5 = ADRP target-flags(aarch64-page) @g1 + %x5 = LDRXui %x5, target-flags(aarch64-pageoff) @g1 + STRXui undef %x11, %x5, 32 + + bb.12: + ; CHECK-NOT: MCLOH_AdrpAdrp + ; CHECK: Adding MCLOH_AdrpAddLdr + ; %X9 = ADRP + ; %X9 = ADDXri %X9, + ; %X5 = LDRXui %X9, 0 + %x9 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x9 = ADDXri %x9, target-flags(aarch64-pageoff, aarch64-got) @g4, 0 + %x5 = LDRXui %x9, 0 + %x9 = ADRP target-flags(aarch64-page, aarch64-got) @g5 + + bb.13: + successors: %bb.14 + ; Cannot produce a LOH for multiple users + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + %x11 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + B %bb.14 + + bb.14: + liveins: %x10 + %x12 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 +... 
diff --git a/test/CodeGen/AArch64/machine-scheduler.mir b/test/CodeGen/AArch64/machine-scheduler.mir index e7e0dda53c5..933afdb6da9 100644 --- a/test/CodeGen/AArch64/machine-scheduler.mir +++ b/test/CodeGen/AArch64/machine-scheduler.mir @@ -21,8 +21,9 @@ # CHECK: LDRWui %x0, 0 # CHECK: LDRWui %x0, 1 # CHECK: STRWui %w1, %x0, 2 -name: load_imp-def -body: | +name: load_imp-def +tracksRegLiveness: true +body: | bb.0.entry: liveins: %w1, %x0 %w8 = LDRWui %x0, 1, implicit-def %x8 :: (load 4 from %ir.0) diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll index 28c8b5d73b0..d9662b69b12 100644 --- a/test/CodeGen/AMDGPU/hsa-func.ll +++ b/test/CodeGen/AMDGPU/hsa-func.ll @@ -30,12 +30,11 @@ ; ELF: Type: Function (0x2) ; ELF: } +; HSA: .text ; HSA: .hsa_code_object_version 2,1 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .text - ; HSA-NOT: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 78a5cdb576f..12c15441c0f 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -34,12 +34,12 @@ ; ELF: Type: AMDGPU_HSA_KERNEL (0xA) ; ELF: } +; HSA-NOT: .AMDGPU.config +; HSA: .text ; HSA: .hsa_code_object_version 2,1 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .text - ; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t diff --git a/test/CodeGen/Generic/cfi-sections.ll b/test/CodeGen/Generic/cfi-sections.ll new file mode 100644 index 00000000000..6e721d6df70 --- /dev/null +++ b/test/CodeGen/Generic/cfi-sections.ll @@ -0,0 +1,39 @@ +; When using Itanium ABI, do not emit .debug_frame. +; RUNT: llc -mtriple=i386--linux -o - < %s | FileCheck %s -check-prefix=WITHOUT +; RUNT: llc -mtriple=armv7-netbsd-eabi -o - < %s | FileCheck %s -check-prefix=WITHOUT + +; When using EHABI, do emit .debug_frame. 
+; RUN: llc -mtriple=arm-linux -mcpu=cortex-a7 -mattr=v7 -o - < %s | FileCheck %s -check-prefix=WITH + +; REQUIRES: x86-registered-target +; REQUIRES: arm-registered-target + +; WITH: .cfi_sections .debug_frame +; WITHOUT-NOT: .cfi_sections + +define i32 @foo() #0 !dbg !7 { + %1 = call i32 @bar() + %2 = call i32 @bar() + %3 = add nsw i32 %1, %2 + ret i32 %3 +} + +declare i32 @bar() #1 + +attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "cfi-sections.cc", directory: ".") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{i32 1, !"min_enum_size", i32 4} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{!10} +!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) diff --git a/test/CodeGen/MIR/AArch64/spill-fold.mir b/test/CodeGen/MIR/AArch64/spill-fold.mir new file mode 100644 index 00000000000..05e7f7521ed --- /dev/null +++ b/test/CodeGen/MIR/AArch64/spill-fold.mir @@ -0,0 +1,82 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass greedy -verify-machineinstrs -o - %s | FileCheck %s +--- | + define i64 @test_subreg_spill_fold() { ret i64 0 } + define i64 @test_subreg_spill_fold2() { ret i64 0 } + define i64 @test_subreg_spill_fold3() { ret i64 0 } + define i64 @test_subreg_fill_fold() { ret i64 0 } + define double @test_subreg_fill_fold2() { ret double 0.0 } +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold +# Ensure that the spilled subreg COPY is eliminated and folded into the spill store. 
+name: test_subreg_spill_fold +registers: + - { id: 0, class: gpr64 } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.sub_32 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + %x0 = COPY %0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold2 +# Similar to test_subreg_spill_fold, but with a vreg0 register class not containing %WZR. +name: test_subreg_spill_fold2 +registers: + - { id: 0, class: gpr64sp } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.sub_32 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + %x0 = ADDXri %0, 1, 0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold3 +# Similar to test_subreg_spill_fold, but with a cross register class copy. 
+name: test_subreg_spill_fold3 +registers: + - { id: 0, class: fpr64 } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.ssub = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %d0, 12, implicit-def dead %d1, 12, implicit-def dead %d2, 12, implicit-def dead %d3, 12, implicit-def dead %d4, 12, implicit-def dead %d5, 12, implicit-def dead %d6, 12, implicit-def dead %d7, 12, implicit-def dead %d8, 12, implicit-def dead %d9, 12, implicit-def dead %d10, 12, implicit-def dead %d11, 12, implicit-def dead %d12, 12, implicit-def dead %d13, 12, implicit-def dead %d14, 12, implicit-def dead %d15, 12, implicit-def dead %d16, 12, implicit-def dead %d17, 12, implicit-def dead %d18, 12, implicit-def dead %d19, 12, implicit-def dead %d20, 12, implicit-def dead %d21, 12, implicit-def dead %d22, 12, implicit-def dead %d23, 12, implicit-def dead %d24, 12, implicit-def dead %d25, 12, implicit-def dead %d26, 12, implicit-def dead %d27, 12, implicit-def dead %d28, 12, implicit-def dead %d29, 12, implicit-def dead %d30, 12, implicit-def %d31 + %x0 = COPY %0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_fill_fold +# Ensure that the filled COPY is eliminated and folded into the fill load. +name: test_subreg_fill_fold +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr64 } +body: | + bb.0: + %0 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + ; CHECK: undef %1.sub_32 = LDRWui %stack.0, 0 :: (load 4 from %stack.0) + undef %1.sub_32 = COPY %0 + %x0 = COPY %1 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_fill_fold2 +# Similar to test_subreg_fill_fold, but with a cross-class copy. 
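+# The reload feeds the ssub lane of an FPR64 vreg, so the expected fill uses
+# LDRSui rather than the LDRWui form checked in test_subreg_fill_fold.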
+name: test_subreg_fill_fold2 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: fpr64 } +body: | + bb.0: + %0 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + ; CHECK: undef %1.ssub = LDRSui %stack.0, 0 :: (load 4 from %stack.0) + undef %1.ssub = COPY %0 + %d0 = COPY %1 + RET_ReallyLR implicit %d0 +... diff --git a/test/CodeGen/MIR/X86/basic-block-liveins.mir b/test/CodeGen/MIR/X86/basic-block-liveins.mir index 35f5512936b..b347368a94b 100644 --- a/test/CodeGen/MIR/X86/basic-block-liveins.mir +++ b/test/CodeGen/MIR/X86/basic-block-liveins.mir @@ -22,7 +22,8 @@ ... --- -name: test +name: test +tracksRegLiveness: true body: | ; CHECK-LABEL: bb.0.body: ; CHECK-NEXT: liveins: %edi, %esi @@ -33,7 +34,8 @@ body: | RETQ %eax ... --- -name: test2 +name: test2 +tracksRegLiveness: true body: | ; CHECK-LABEL: name: test2 ; Verify that we can have multiple lists of liveins that will be merged into @@ -48,7 +50,8 @@ body: | RETQ %eax ... --- -name: test3 +name: test3 +tracksRegLiveness: true body: | ; Verify that we can have an empty list of liveins. ; CHECK-LABEL: name: test3 diff --git a/test/CodeGen/MIR/X86/machine-verifier.mir b/test/CodeGen/MIR/X86/machine-verifier.mir index c56bab8c998..7421146c22e 100644 --- a/test/CodeGen/MIR/X86/machine-verifier.mir +++ b/test/CodeGen/MIR/X86/machine-verifier.mir @@ -10,7 +10,8 @@ ... 
--- -name: inc +name: inc +tracksRegLiveness: true body: | bb.0.entry: liveins: %edi diff --git a/test/CodeGen/NVPTX/tid-range.ll b/test/CodeGen/NVPTX/tid-range.ll new file mode 100644 index 00000000000..3dc4008810a --- /dev/null +++ b/test/CodeGen/NVPTX/tid-range.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -march=nvptx64 | FileCheck %s +declare i32 @get_register() + +define i1 @test1() { +entry: + %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !0 + %cmp = icmp eq i32 %call, 1 + ret i1 %cmp +} + +; CHECK-LABEL: test1( +; CHECK: setp.eq.s32 %p1, %r1, 1; +; CHECK: selp.u32 %[[R:.+]], 1, 0, %p1; +; CHECK: st.param.b32 [func_retval0+0], %[[R]]; + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!0 = !{ i32 0, i32 3 } diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll index 6cda38aa94f..425d2609380 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll @@ -24,7 +24,7 @@ define void @test_void_return() { ; CHECK-NEXT: hasVAStart: false ; CHECK-NEXT: hasMustTailInVarArgFunc: false ; CHECK-NEXT: body: -; CHECK-NEXT: bb.1: +; CHECK-NEXT: bb.1.entry: ; CHECK-NEXT: RET 0 entry: ret void diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 3c649e18bc3..8590d641a4c 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2902,6 +2902,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, < define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { ; CHECK-LABEL: test_mask_vextracti64x4: ; CHECK: ## BB#0: +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: kshiftlw $12, %k1, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 @@ -2923,7 +2924,7 @@ define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) ; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2 ; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq - %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask) ret <4 x i64> %res } @@ -2963,9 +2964,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x define <4 x double> @test_vextractf64x4(<8 x double> %a) { ; CHECK-LABEL: test_vextractf64x4: ; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: retq - %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1) ret <4 x double> %res } diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll index 646697b82c2..04d21ecd3e8 100644 --- a/test/CodeGen/X86/avx512-trunc.ll +++ b/test/CodeGen/X86/avx512-trunc.ll @@ -500,3 +500,110 @@ define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { store <8 x i8> %x, <8 x i8>* %res ret void } + + +define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { +; KNL-LABEL: usat_trunc_wb_256_mem: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vmovdqu %xmm0, (%rdi) +; KNL-NEXT: retq +; +; 
SKX-LABEL: usat_trunc_wb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %ymm0, (%rdi) +; SKX-NEXT: retq + %x3 = icmp ult <16 x i16> %i, + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { +; KNL-LABEL: usat_trunc_wb_256: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %ymm0, %xmm0 +; SKX-NEXT: retq + %x3 = icmp ult <16 x i16> %i, + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + ret <16 x i8> %x6 +} + +define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) { +; KNL-LABEL: usat_trunc_wb_128_mem: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: retq + %x3 = icmp ult <8 x i16> %i, + %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> + %x6 = trunc <8 x i16> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) { +; ALL-LABEL: usat_trunc_db_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusdb %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <16 x i32> %i, + %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> + %x6 = trunc <16 x i32> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) { +; ALL-LABEL: usat_trunc_qb_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> + %x6 = trunc <8 x i64> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) { +; ALL-LABEL: usat_trunc_qd_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqd %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> + %x6 = trunc <8 x i64> %x5 to <8 x i32> + store <8 x i32> %x6, <8 x i32>* %res, align 1 + ret void +} + +define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) { +; ALL-LABEL: usat_trunc_qw_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqw %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> + %x6 = trunc <8 x i64> %x5 to <8 x i16> + store <8 x i16> %x6, <8 x i16>* %res, align 1 + ret void +} + diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll index a961dbac7dd..8e9bc8b5af4 100644 --- a/test/CodeGen/X86/cmov.ll +++ b/test/CodeGen/X86/cmov.ll @@ -156,3 +156,21 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind { %d = select i1 %c, i8 %a, i8 %b ret i8 %d } + +; FIXME: The 'not' is redundant. 
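+; (The expected output computes ~x twice, once with 'notl %ecx' and once with
+; 'xorl $-1, %edi'; the xor alone already produces both the value and the flags
+; the cmov needs.)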
+ +define i32 @smin(i32 %x) { +; CHECK-LABEL: smin: +; CHECK: ## BB#0: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: notl %ecx +; CHECK-NEXT: xorl $-1, %edi +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: cmovsl %ecx, %eax +; CHECK-NEXT: retq + %not_x = xor i32 %x, -1 + %1 = icmp slt i32 %not_x, -1 + %sel = select i1 %1, i32 %not_x, i32 -1 + ret i32 %sel +} + diff --git a/test/CodeGen/X86/lower-vec-shift-2.ll b/test/CodeGen/X86/lower-vec-shift-2.ll index 6ca76c2e7e4..a617f44d3f9 100644 --- a/test/CodeGen/X86/lower-vec-shift-2.ll +++ b/test/CodeGen/X86/lower-vec-shift-2.ll @@ -12,8 +12,7 @@ define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test1: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -32,8 +31,7 @@ define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test2: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -68,8 +66,7 @@ define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test4: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -88,8 +85,7 @@ define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test5: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -124,8 +120,7 @@ define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test7: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -144,8 +139,7 @@ define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test8: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/test/CodeGen/X86/shuffle-vs-trunc-128.ll new file mode 100644 index 00000000000..f9fe97b21ee --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -0,0 +1,481 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. + +define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v8i16_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i16_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i16_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i16_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <8 x i16> + %strided.vec = trunc <8 x i16> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v8i16_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: 
shuffle_v8i16_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i16_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v4i32_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i32_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i32_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i32_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %bc = bitcast <8 x i16> %vec to <4 x i32> + %strided.vec = trunc <4 x i32> %bc to <4 x i16> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { +; AVX-LABEL: shuffle_v4i32_to_v2i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v4i32_to_v2i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_to_v2i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v4i32_to_v2i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <4 x i32>, <4 x i32>* %L + %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> + store <2 x i32> %strided.vec, <2 x i32>* %S + ret void +} + +define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { +; 
AVX-LABEL: trunc_v2i64_to_v2i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <4 x i32>, <4 x i32>* %L + %bc = bitcast <4 x i32> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i32> + store <2 x i32> %strided.vec, <2 x i32>* %S + ret void +} + +define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v4i32_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i32_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i32_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i32_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <4 x i32> + 
%strided.vec = trunc <4 x i32> %bc to <4 x i8> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v8i16_to_v2i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v8i16_to_v2i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_to_v2i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i16_to_v2i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> + store <2 x i16> %strided.vec, <2 x i16>* %S + ret void +} + +define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v2i64_to_v2i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %bc = bitcast <8 x i16> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i16> + store <2 x i16> %strided.vec, <2 x i16>* %S + ret void +} + +define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v2i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v2i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v2i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v2i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> + store <2 x i8> %strided.vec, <2 x i8>* %S + ret void +} + +define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v2i64_to_v2i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i8> + store <2 x i8> %strided.vec, <2 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll new file mode 100644 index 00000000000..893f96e6fb2 --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -0,0 +1,629 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. 
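+; As a sketch of the pattern (not an exact copy of the IR below): taking the
+; even bytes of a <32 x i8> value,
+;   %even = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, ..., i32 30>
+; is equivalent on little-endian x86 to truncating its <16 x i16> reinterpretation,
+;   %bc = bitcast <32 x i8> %v to <16 x i16>
+;   %tr = trunc <16 x i16> %bc to <16 x i8>
+; because the truncate keeps the low byte of every i16 lane.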
+ +define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v16i16_to_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v16i16_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v16i16_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v16i16_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; 
AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <16 x i16> + %strided.vec = trunc <16 x i16> %bc to <16 x i8> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v16i16_to_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i16_to_v8i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v8i32_to_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %bc = bitcast <16 x i16> %vec to <8 x i32> + %strided.vec = trunc <8 x i32> %bc to <8 x i16> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { +; AVX-LABEL: shuffle_v8i32_to_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v8i32_to_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512F-NEXT: vmovaps %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i32>, <8 x i32>* %L + %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> + store <4 x i32> %strided.vec, <4 x i32>* %S + ret void +} + +define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; 
AVX512BW-LABEL: trunc_v4i64_to_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i32>, <8 x i32>* %L + %bc = bitcast <8 x i32> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i32> + store <4 x i32> %strided.vec, <4 x i32>* %S + ret void +} + +define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v8i32_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <8 x i32> + %strided.vec = trunc <8 x i32> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v16i16_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i16_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %bc = bitcast <16 x i16> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i16> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; 
AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i8> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll new file mode 100644 index 00000000000..923290411ae --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -0,0 +1,537 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. 
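+; At 512 bits the contrast is sharpest: with AVX512BW the truncate side can be
+; a single vpmovwb from a zmm register, while the equivalent shuffle currently
+; lowers to a vpshufb/vpblendd/vpermq sequence (see the first pair below).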
+ +define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v32i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> + store <32 x i8> %strided.vec, <32 x i8>* %S + ret void +} + +define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { +; AVX512F-LABEL: trunc_v32i16_to_v32i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v32i16_to_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1 +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v32i16_to_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: 
vpmovwb %zmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <32 x i16> + %strided.vec = trunc <32 x i16> %bc to <32 x i8> + store <32 x i8> %strided.vec, <32 x i8>* %S + ret void +} + +define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { +; AVX512F-LABEL: shuffle_v32i16_to_v16i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30] +; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> + store <16 x i16> %strided.vec, <16 x i16>* %S + ret void +} + +define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { +; AVX512-LABEL: trunc_v16i32_to_v16i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %bc = bitcast <32 x i16> %vec to <16 x i32> + %strided.vec = trunc <16 x i32> %bc to <16 x i16> + store <16 x i16> %strided.vec, <16 x i16>* %S + ret void +} + +define void 
@shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { +; AVX512-LABEL: shuffle_v16i32_to_v8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i32>, <16 x i32>* %L + %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> + store <8 x i32> %strided.vec, <8 x i32>* %S + ret void +} + +define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i32>, <16 x i32>* %L + %bc = bitcast <16 x i32> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i32> + store <8 x i32> %strided.vec, <8 x i32>* %S + ret void +} + +define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm2, 
%eax +; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx +; AVX512BWVL-NEXT: vmovd %ecx, %xmm1 +; AVX512BWVL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX512-LABEL: trunc_v16i32_to_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <16 x i32> + %strided.vec = trunc <16 x i32> %bc to <16 x i8> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX512F-LABEL: shuffle_v32i16_to_v8i16: +; AVX512F: # BB#0: 
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vmovd %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vmovd %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 
+; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %bc = bitcast <32 x i16> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i16> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %r8d +; AVX512BW-NEXT: vpextrb $0, %xmm1, %r9d +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %r10d +; AVX512BW-NEXT: vpextrb $0, %xmm1, %r11d +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax +; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx +; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx +; AVX512BW-NEXT: vpextrb $0, %xmm0, %edi +; AVX512BW-NEXT: vpinsrb $0, %edi, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; 
AVX512BW-NEXT: vpinsrb $4, %r11d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/tail-call-conditional.mir b/test/CodeGen/X86/tail-call-conditional.mir index af6e95d4610..75cb1e451d8 100644 --- a/test/CodeGen/X86/tail-call-conditional.mir +++ b/test/CodeGen/X86/tail-call-conditional.mir @@ -26,7 +26,8 @@ ... 
--- -name: test +name: test +tracksRegLiveness: true liveins: - { reg: '%rdi' } - { reg: '%rsi' } diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index 50febd4c1ec..fbb67ebbf60 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -87,14 +87,12 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm3, %xmm4 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psllq %xmm1, %xmm3 ; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1 -; X32-SSE-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero ; X32-SSE-NEXT: psrlq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; X32-SSE-NEXT: orpd %xmm4, %xmm1 diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index fc67914015b..27b65b82992 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
@@ -80,7 +80,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX512-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3 ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -90,20 +90,19 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; X32-SSE-LABEL: var_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: psrlq %xmm2, %xmm4 -; X32-SSE-NEXT: movq {{.*#+}} xmm5 = xmm1[0],zero -; X32-SSE-NEXT: psrlq %xmm5, %xmm3 -; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlq %xmm2, %xmm1 -; X32-SSE-NEXT: psrlq %xmm5, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; X32-SSE-NEXT: xorpd %xmm4, %xmm1 -; X32-SSE-NEXT: psubq %xmm4, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X32-SSE-NEXT: movdqa %xmm2, %xmm4 +; X32-SSE-NEXT: psrlq %xmm3, %xmm4 +; X32-SSE-NEXT: psrlq %xmm1, %xmm2 +; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrlq %xmm3, %xmm2 +; X32-SSE-NEXT: psrlq %xmm1, %xmm0 +; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; X32-SSE-NEXT: xorpd %xmm4, %xmm2 +; X32-SSE-NEXT: psubq %xmm4, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i64> %a, %b ret <2 x i64> %shift @@ -189,7 +188,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -323,11 +322,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -499,7 +498,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] @@ -627,7 +626,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX512-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 @@ -637,7 +636,6 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; 
X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X32-SSE-NEXT: psrlq %xmm1, %xmm2 ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 @@ -659,29 +657,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: psrad %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrad %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -706,29 +700,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psraw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psraw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -919,7 +909,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] @@ -1066,7 +1056,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} 
xmm1 = [4611686018427387904,72057594037927936] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -1150,7 +1140,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1232,11 +1222,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -1393,7 +1383,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] @@ -1528,7 +1518,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $7, %xmm0, %xmm1 ; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] @@ -1564,7 +1554,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1593,7 +1583,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1632,7 +1622,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 5725fcb8c12..ee1879b6696 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; @@ 
-66,7 +66,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 ; AVX512-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -131,7 +131,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, %b @@ -213,11 +213,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -332,7 +332,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -411,7 +411,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 ; AVX512-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 @@ -426,9 +426,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -436,16 +435,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, 
%xmm2, %ymm0, %ymm0 @@ -453,15 +450,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -473,8 +468,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -482,16 +476,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -499,15 +491,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -619,7 +609,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] @@ -702,7 +692,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; 
XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -750,7 +740,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, @@ -815,11 +805,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -924,7 +914,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] @@ -998,7 +988,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $7, %ymm0, %ymm1 ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -1035,7 +1025,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, @@ -1070,7 +1060,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, @@ -1123,7 +1113,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll index 27ff134fd10..1280641c557 100644 --- a/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s 
--check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, %b @@ -16,7 +16,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, %b @@ -25,7 +25,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -48,7 +48,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, %b @@ -57,7 +57,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -109,100 +109,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; 
AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; 
AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -212,11 +212,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -224,85 +224,85 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx 
; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -310,17 +310,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -328,89 +328,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: 
%CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -418,86 +418,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, 
%xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -514,7 +514,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # 
BB#0: ; ALL-NEXT: vpsraq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -524,9 +524,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -536,17 +535,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsraw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsraw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -556,7 +553,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] @@ -602,101 +599,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; 
AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -706,11 +703,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, 
%eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -718,85 +715,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; 
AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -804,17 +801,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -822,89 +819,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: 
movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -912,86 +909,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: 
movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -1009,7 +1006,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, @@ -1018,7 +1015,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, @@ -1027,7 
+1024,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] @@ -1049,7 +1046,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, @@ -1058,7 +1055,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] @@ -1104,7 +1101,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1362,7 +1359,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsraq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, @@ -1371,7 +1368,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrad $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, @@ -1380,13 +1377,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsraw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, @@ -1395,7 +1392,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1409,7 +1406,7 @@ define <64 x i8> 
@splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -1422,14 +1419,14 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) { ; AVX512DQ-LABEL: ashr_const7_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: ashr_const7_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 0dab815d4d4..42488f2ec3a 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 @@ -60,7 +60,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -69,7 +69,6 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psrlq %xmm3, %xmm2 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; X32-SSE-NEXT: movapd %xmm2, %xmm0 @@ -158,7 +157,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -292,11 +291,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -417,7 +416,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 @@ -487,13 +486,12 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -511,29 +509,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: psrld %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: 
vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -558,29 +552,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psrlw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -709,7 +699,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 @@ -810,7 +800,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -884,7 +874,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -966,11 +956,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -1073,7 +1063,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 @@ -1145,7 +1135,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1174,7 +1164,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: 
splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1203,7 +1193,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1236,7 +1226,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index 09822ee6c61..5223d7bba35 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; @@ -47,7 +47,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, %b @@ -108,7 +108,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, %b @@ -190,11 +190,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -276,7 +276,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -326,7 +326,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -337,9 +337,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind 
{ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -347,16 +346,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -364,15 +361,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -384,8 +379,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -393,16 +387,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -410,15 +402,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; 
XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -501,7 +491,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -560,7 +550,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, @@ -605,7 +595,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, @@ -670,11 +660,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -750,7 +740,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 @@ -801,7 +791,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, @@ -836,7 +826,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, @@ -871,7 +861,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, @@ -913,7 +903,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq diff --git 
a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll index 06bf12a621a..4c3caf329fb 100644 --- a/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -8,7 +8,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, %b @@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, %b @@ -26,7 +26,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -49,7 +49,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, %b @@ -58,7 +58,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 @@ -89,100 +89,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: 
## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; 
AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -192,11 +192,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -204,85 +204,85 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; 
AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -290,17 +290,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -308,89 +308,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: 
movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -398,86 +398,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; 
AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -494,7 +494,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define 
<8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -504,9 +504,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -516,17 +515,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -536,7 +533,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -565,101 +562,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, 
%eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -669,11 +666,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; 
AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -681,85 +678,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: 
vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -767,17 +764,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -785,89 +782,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, 
%eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -875,86 +872,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, 
%eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -972,7 +969,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, @@ -981,7 +978,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, @@ -990,7 +987,7 @@ define <16 x i32> 
@constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] @@ -1012,7 +1009,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, @@ -1021,7 +1018,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1050,7 +1047,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1308,7 +1305,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, @@ -1317,7 +1314,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrld $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, @@ -1326,13 +1323,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, @@ -1341,7 +1338,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1350,7 +1347,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git 
a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index ec2e61d3ca0..5c89949e924 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 @@ -58,7 +58,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -67,7 +67,6 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psllq %xmm3, %xmm2 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psllq %xmm1, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; X32-SSE-NEXT: movapd %xmm2, %xmm0 @@ -124,7 +123,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -247,11 +246,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -367,7 +366,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 @@ -435,13 +434,12 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psllq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -459,29 +457,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: 
splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: pslld %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pslld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -506,29 +500,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psllw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -650,7 +640,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 @@ -747,7 +737,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -802,7 +792,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -838,11 +828,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0 
%XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -928,7 +918,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 @@ -998,7 +988,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1027,7 +1017,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1056,7 +1046,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1087,7 +1077,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index 00d04063301..eb52ae3ccac 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -45,7 +45,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, %b @@ -89,7 +89,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, %b @@ -165,11 +165,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # 
kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -241,7 +241,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -290,7 +290,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -301,9 +301,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -311,16 +310,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -328,15 +325,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -348,8 +343,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: 
vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -357,16 +351,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -374,15 +366,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -457,7 +447,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -512,7 +502,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, @@ -547,7 +537,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, @@ -582,11 +572,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -656,7 +646,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 @@ -706,7 +696,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: 
splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $7, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, @@ -741,7 +731,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, @@ -776,7 +766,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, @@ -817,7 +807,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll index eb1309d9bb0..520c3237a57 100644 --- a/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/test/CodeGen/X86/vector-shift-shl-512.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -8,7 +8,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, %b @@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, %b @@ -26,7 +26,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -49,7 +49,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, %b @@ -58,7 +58,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> 
%a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 @@ -86,100 +86,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; 
AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -189,11 +189,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -201,85 +201,85 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; 
AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -287,17 +287,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -305,89 +305,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: 
vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl 
%dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -395,86 +395,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: 
%CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -491,7 +491,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -501,9 +501,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -513,17 +512,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -533,7 +530,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] @@ -559,101 +556,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # 
BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; 
AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -663,11 +660,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -675,85 +672,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb 
$9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -761,17 +758,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -779,89 +776,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, 
%eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -869,86 +866,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> 
%b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; 
AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -966,7 +963,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, @@ -975,7 +972,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, @@ -984,14 +981,14 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, @@ -1000,7 +997,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1026,7 +1023,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1284,7 +1281,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, @@ -1293,7 +1290,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpslld $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, @@ -1302,13 +1299,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; 
AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, @@ -1317,7 +1314,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1326,7 +1323,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index cad8f85395d..2aab77433df 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -702,17 +702,11 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE41-NEXT: pinsrb $5, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -739,17 +733,11 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16( ; SSE41-NEXT: pinsrb $15, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -776,17 +764,11 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE41-NEXT: pinsrb $2, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: 
vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 3 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -1222,19 +1204,12 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00( ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: -; AVX1OR2: # BB#0: # %entry -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: -; AVX512VL: # BB#0: # %entry -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq entry: %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -1771,21 +1746,13 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: PR31364: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: PR31364: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; AVX512VL-NEXT: retq +; AVX-LABEL: PR31364: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] +; AVX-NEXT: retq %v0 = load i8, i8* %a, align 1 %vecins = insertelement <16 x i8> , i8 %v0, i32 0 %v1 = load i8, i8* %b, align 1 diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 4270d3d216c..3e9e980a197 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1354,19 +1354,12 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_z6zz: -; AVX2: 
# BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_z6zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_z6zz: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> ret <4 x i32> %shuffle } @@ -1683,17 +1676,11 @@ define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_0z23: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_0z23: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_0z23: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1726,17 +1713,11 @@ define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_01z3: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_01z3: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_01z3: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1769,17 +1750,11 @@ define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_012z: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_012z: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_012z: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1812,17 +1787,11 @@ define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: 
shuffle_v4i32_0zz3: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_0zz3: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_0zz3: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 2421b2b579c..ac9db62d3c1 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1415,17 +1415,11 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { ; SSE-NEXT: pinsrw $1, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_z8zzzzzz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1438,17 +1432,11 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { ; SSE-NEXT: pinsrw $5, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zzzzz8zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1461,17 +1449,11 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { ; SSE-NEXT: pinsrw $7, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zuuzuuz8: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1484,17 +1466,11 @@ define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { ; SSE-NEXT: pinsrw $2, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $2, %edi, %xmm0, 
%xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zzBzzzzz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 3 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -2102,17 +2078,11 @@ define <8 x i16> @shuffle_v8i16_0z234567(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0z234567: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0z234567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0z234567: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } @@ -2134,17 +2104,11 @@ define <8 x i16> @shuffle_v8i16_0zzzz5z7(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0zzzz5z7: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0zzzz5z7: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0zzzz5z7: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } @@ -2166,17 +2130,11 @@ define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0123456z: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0123456z: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0123456z: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll index 2837c28a484..04d6b373324 100644 --- a/test/CodeGen/X86/vector-shuffle-masked.ll +++ b/test/CodeGen/X86/vector-shuffle-masked.ll @@ -236,3 +236,453 @@ define <8 x i32> @maskz_shuffle_v8i32_23456701(<8 x i32> %a, i8 %mask) { %res = select <8 x i1> 
%mask.cast, <8 x i32> %shuffle, <8 x i32> zeroinitializer ret <8 x i32> %res } + +define <4 x i32> @mask_extract_v16i32_v4i32_0(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_1(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_2(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_3(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_0(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_1(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 
x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_2(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_3(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <8 x i32> @mask_extract_v16i32_v8i32_0(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v8i32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x i32> @mask_extract_v16i32_v8i32_1(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v8i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x float> @mask_extract_v16f32_v8f32_0(<16 x float> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v8f32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuffle, <8 x float> %passthru + ret <8 x float> %res +} + +define <8 x float> @mask_extract_v16f32_v8f32_1(<16 x float> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v8f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuffle, <8 x float> %passthru + ret <8 x float> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_0(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; 
CHECK-LABEL: mask_extract_v8i64_v2i64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_1(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_2(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_3(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_0(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_1(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_2(<8 x 
double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_3(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <4 x i64> @mask_extract_v8i64_v4i64_0(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v4i64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x i64> @mask_extract_v8i64_v4i64_1(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v4i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x double> @mask_extract_v8f64_v4f64_0(<8 x double> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v4f64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle, <4 x double> %passthru + ret <4 x double> %res +} + +define <4 x double> @mask_extract_v8f64_v4f64_1(<8 x double> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v4f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle, <4 x double> 
%passthru + ret <4 x double> %res +} + +define <8 x i32> @mask_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v8i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> + %shuffle.cast = bitcast <4 x i64> %shuffle to <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle.cast, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x float> @mask_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v8f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> + %shuffle.cast = bitcast <4 x double> %shuffle to <8 x float> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuffle.cast, <8 x float> %passthru + ret <8 x float> %res +} + +define <4 x i32> @mask_cast_extract_v8i64_v4i32_1(<8 x i64> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle.cast, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x float> @mask_cast_extract_v8f64_v4f32_1(<8 x double> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %shuffle.cast = bitcast <2 x double> %shuffle to <4 x float> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle.cast, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x i64> @mask_cast_extract_v16i32_v4i64_1(<16 x i32> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %shuffle.cast = bitcast <8 x i32> %shuffle to <4 x i64> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle.cast, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x double> @mask_cast_extract_v16f32_v4f64_1(<16 x float> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle 
= shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> + %shuffle.cast = bitcast <8 x float> %shuffle to <4 x double> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle.cast, <4 x double> %passthru + ret <4 x double> %res +} + +define <2 x i64> @mask_cast_extract_v16i32_v2i64_1(<16 x i32> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle.cast, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %shuffle.cast = bitcast <4 x float> %shuffle to <2 x double> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle.cast, <2 x double> %passthru + ret <2 x double> %res +} diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 2f5e177badc..bf32e672138 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -100,73 +100,22 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv2i64: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv2i64: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv2i64: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: 
vpsubq %xmm0, %xmm1, %xmm2 -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv2i64: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv2i64: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # BB#0: @@ -873,81 +822,24 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv8i16: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv8i16: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, 
%xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv8i16: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv8i16: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv8i16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # BB#0: @@ -1071,81 +963,24 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv8i16u: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: 
testv8i16u: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv8i16u: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv8i16u: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv8i16u: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # BB#0: @@ -1253,69 +1088,21 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE41-NEXT: paddb %xmm4, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv16i8: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv16i8: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv16i8: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv16i8: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv16i8: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # BB#0: @@ -1419,69 +1206,21 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE41-NEXT: paddb %xmm4, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv16i8u: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv16i8u: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv16i8u: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv16i8u: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv16i8u: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # BB#0: diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll index 4e5fb60fae8..5d486e79405 100644 --- a/test/CodeGen/X86/vshift-4.ll +++ b/test/CodeGen/X86/vshift-4.ll @@ -9,7 +9,6 @@ define void @shift1a(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X32-LABEL: shift1a: ; X32: # BB#0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-NEXT: psllq %xmm1, 
%xmm0 ; X32-NEXT: movdqa %xmm0, (%eax) ; X32-NEXT: retl @@ -34,7 +33,6 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; X32-NEXT: movdqa %xmm0, %xmm3 ; X32-NEXT: psllq %xmm2, %xmm3 -; X32-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-NEXT: psllq %xmm1, %xmm0 ; X32-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; X32-NEXT: movapd %xmm3, (%eax) diff --git a/test/DebugInfo/Generic/licm-hoist-debug-loc.ll b/test/DebugInfo/Generic/licm-hoist-debug-loc.ll new file mode 100644 index 00000000000..c42396d9059 --- /dev/null +++ b/test/DebugInfo/Generic/licm-hoist-debug-loc.ll @@ -0,0 +1,75 @@ +; RUN: opt -S -licm %s | FileCheck %s +; +; LICM should null out debug locations when it hoists instructions out of a loop. +; +; Generated with +; clang -O0 -S -emit-llvm test.cpp -g -gline-tables-only -o t.ll +; opt -S -sroa -adce -simplifycfg -reassociate -domtree -loops \ +; -loop-simplify -lcssa -basicaa -aa -scalar-evolution -loop-rotate t.ll > test.ll +; +; void bar(int *); +; void foo(int k, int p) +; { +; for (int i = 0; i < k; i++) { +; bar(&p + 4); +; } +; } +; +; We make sure that the instruction that is hoisted into the preheader +; does not have a debug location. +; CHECK: for.body.lr.ph: +; CHECK: getelementptr{{.*}}%p.addr, i64 4{{$}} +; CHECK: for.body: +; +; ModuleID = 't.ll' +source_filename = "test.c" + +; Function Attrs: nounwind sspstrong uwtable +define void @foo(i32 %k, i32 %p) !dbg !7 { +entry: + %p.addr = alloca i32, align 4 + store i32 %p, i32* %p.addr, align 4 + %cmp2 = icmp slt i32 0, %k, !dbg !9 + br i1 %cmp2, label %for.body.lr.ph, label %for.end, !dbg !9 + +for.body.lr.ph: ; preds = %entry + br label %for.body, !dbg !9 + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %add.ptr = getelementptr inbounds i32, i32* %p.addr, i64 4, !dbg !11 + call void @bar(i32* %add.ptr), !dbg !11 + %inc = add nsw i32 %i.03, 1, !dbg !12 + %cmp = icmp slt i32 %inc, %k, !dbg !9 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !dbg !9, !llvm.loop !14 + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end, !dbg !9 + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void, !dbg !16 +} + +declare void @bar(i32*) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.1 (PS4 clang version 4.50.0.249 7e7cd823 checking)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) +!1 = !DIFile(filename: "test.c", directory: "D:\test") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"PIC Level", i32 2} +!6 = !{!"clang version 3.9.1 (PS4 clang version 4.50.0.249 7e7cd823 checking)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 4, scope: !10) +!10 = !DILexicalBlockFile(scope: !7, file: !1, discriminator: 1) +!11 = !DILocation(line: 5, scope: !7) +!12 = !DILocation(line: 4, scope: !13) +!13 = !DILexicalBlockFile(scope: !7, file: !1, discriminator: 2) +!14 = distinct !{!14, !15} +!15 = !DILocation(line: 4, scope: !7) +!16 = !DILocation(line: 7, scope: !7) diff --git 
a/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll index 0667685befc..ddfd7ca7c36 100644 --- a/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll +++ b/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll @@ -73,7 +73,43 @@ define void @store.v4i64.0001(<4 x i32*> %arg) sanitize_address { define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) sanitize_address { ; ALL-LABEL: @store.v4f32.variable %p = load <4 x float>*, <4 x float>** @v4f32, align 8 -; ALL-NOT: call void @__asan_store +; STORE: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0 +; STORE: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]] +; STORE: