From dd9b55d3d923a40aefd5441a2345f309cf2e156d Mon Sep 17 00:00:00 2001
From: dim
Date: Fri, 6 Jan 2017 20:13:21 +0000
Subject: [PATCH] Vendor import of llvm trunk r291274:

https://llvm.org/svn/llvm-project/llvm/trunk@291274

---
 cmake/config-ix.cmake | 7 +-
 cmake/modules/AddLLVM.cmake | 4 +
 docs/CompileCudaWithLLVM.rst | 4 +-
 docs/Phabricator.rst | 2 +-
 include/llvm/Analysis/CGSCCPassManager.h | 2 +-
 include/llvm/Analysis/TargetTransformInfo.h | 19 +-
 .../llvm/Analysis/TargetTransformInfoImpl.h | 30 +-
 include/llvm/Bitcode/BitCodes.h | 6 +-
 include/llvm/Bitcode/BitstreamReader.h | 11 +-
 include/llvm/Bitcode/BitstreamWriter.h | 26 +-
 include/llvm/CodeGen/AsmPrinter.h | 7 +
 include/llvm/CodeGen/BasicTTIImpl.h | 5 +-
 include/llvm/CodeGen/DIE.h | 9 +-
 .../llvm/CodeGen/GlobalISel/IRTranslator.h | 6 +-
 include/llvm/CodeGen/MachineBasicBlock.h | 2 +-
 include/llvm/DebugInfo/DWARF/DWARFDie.h | 58 +-
 .../llvm/ExecutionEngine/Orc/RawByteChannel.h | 2 +-
 include/llvm/IR/ModuleSummaryIndex.h | 112 +-
 include/llvm/IR/ModuleSummaryIndexYAML.h | 111 +
 include/llvm/IR/PassManager.h | 250 +-
 include/llvm/LTO/LTO.h | 7 +
 include/llvm/MC/MCTargetOptions.h | 10 +-
 include/llvm/Support/FileSystem.h | 19 +-
 include/llvm/Support/TarWriter.h | 32 +
 include/llvm/Transforms/IPO/FunctionImport.h | 13 +-
 include/llvm/Transforms/IPO/LowerTypeTests.h | 4 -
 .../Transforms/Utils/FunctionImportUtils.h | 18 +
 include/llvm/module.modulemap | 1 +
 lib/Analysis/ModuleSummaryAnalysis.cpp | 120 +-
 lib/Analysis/TargetTransformInfo.cpp | 5 +-
 lib/Bitcode/Reader/BitcodeReader.cpp | 16 +-
 lib/Bitcode/Reader/BitstreamReader.cpp | 22 +-
 lib/Bitcode/Reader/MetadataLoader.cpp | 400 +++-
 lib/Bitcode/Writer/BitcodeWriter.cpp | 191 +-
 lib/CodeGen/AsmPrinter/ARMException.cpp | 3 +-
 lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 24 +-
 .../AsmPrinter/AsmPrinterInlineAsm.cpp | 2 +
 lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 2 +-
 lib/CodeGen/GlobalISel/IRTranslator.cpp | 44 +-
 lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 13 +-
 lib/CodeGen/IfConversion.cpp | 42 +-
 lib/CodeGen/MIRPrinter.cpp | 8 +-
 lib/CodeGen/MachineBasicBlock.cpp | 9 +-
 lib/CodeGen/MachineVerifier.cpp | 18 +-
 lib/CodeGen/RegisterScavenging.cpp | 5 -
 .../SelectionDAG/SelectionDAGBuilder.cpp | 17 +-
 lib/DebugInfo/DWARF/DWARFDie.cpp | 5 +-
 lib/Fuzzer/FuzzerDriver.cpp | 1 +
 lib/Fuzzer/FuzzerFlags.def | 1 +
 lib/Fuzzer/FuzzerIO.h | 3 +
 lib/Fuzzer/FuzzerIOPosix.cpp | 6 +
 lib/Fuzzer/FuzzerIOWindows.cpp | 2 +
 lib/Fuzzer/FuzzerInternal.h | 1 +
 lib/Fuzzer/FuzzerLoop.cpp | 5 +
 lib/Fuzzer/FuzzerMerge.cpp | 12 +-
 lib/Fuzzer/FuzzerOptions.h | 1 +
 lib/Fuzzer/FuzzerTraceState.cpp | 47 -
 lib/Fuzzer/FuzzerUtilPosix.cpp | 6 +
 lib/Fuzzer/FuzzerUtilWindows.cpp | 1 +
 lib/Fuzzer/test/merge.test | 8 +
 lib/LTO/LTO.cpp | 41 +-
 lib/LTO/ThinLTOCodeGenerator.cpp | 42 +-
 lib/Support/APInt.cpp | 2 +-
 lib/Support/CMakeLists.txt | 1 +
 lib/Support/Host.cpp | 20 +-
 lib/Support/TarWriter.cpp | 166 ++
 lib/Support/Unix/Signals.inc | 2 +-
 lib/Target/AArch64/AArch64CollectLOH.cpp | 1123 +++------
 lib/Target/AArch64/AArch64ISelLowering.cpp | 110 +-
 lib/Target/AArch64/AArch64InstrInfo.cpp | 190 +-
 lib/Target/AArch64/AArch64InstrInfo.h | 4 +
 .../AArch64/AArch64InstructionSelector.cpp | 10 +-
 .../AArch64/AArch64InstructionSelector.h | 8 +-
 .../AArch64/AArch64MachineFunctionInfo.h | 48 +-
 lib/Target/AArch64/AArch64Subtarget.cpp | 1 +
 lib/Target/AArch64/AArch64TargetMachine.cpp | 38 +-
 .../AArch64/AArch64TargetTransformInfo.cpp | 7 +-
 .../AArch64/AArch64TargetTransformInfo.h | 2 +-
 .../AArch64/AsmParser/AArch64AsmParser.cpp | 89 +-
 .../Disassembler/AArch64Disassembler.h | 9 +-
 .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 13 +-
 .../MCTargetDesc/AArch64MCCodeEmitter.cpp | 27 +-
 .../MCTargetDesc/AArch64TargetStreamer.cpp | 3 +-
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 10 +-
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp | 7 +-
 lib/Target/ARM/ARMTargetTransformInfo.h | 3 +-
 lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp | 48 +-
 .../Lanai/Disassembler/LanaiDisassembler.h | 7 +-
 .../Lanai/InstPrinter/LanaiInstPrinter.h | 13 +-
 lib/Target/Lanai/LanaiISelLowering.cpp | 42 +-
 lib/Target/Lanai/LanaiRegisterInfo.h | 9 +-
 .../MCTargetDesc/LanaiELFObjectWriter.cpp | 12 +-
 .../Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp | 29 +-
 .../Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp | 15 +-
 lib/Target/PowerPC/PPCISelLowering.h | 2 +-
 lib/Target/PowerPC/PPCInstr64Bit.td | 3 +
 lib/Target/PowerPC/PPCInstrFormats.td | 6 +
 lib/Target/PowerPC/PPCInstrInfo.td | 8 +
 lib/Target/X86/X86ISelLowering.cpp | 172 +-
 lib/Target/X86/X86TargetTransformInfo.cpp | 630 +++---
 lib/Target/X86/X86TargetTransformInfo.h | 3 +-
 lib/Transforms/IPO/FunctionImport.cpp | 182 +-
 lib/Transforms/IPO/LowerTypeTests.cpp | 316 ++-
 .../InstCombine/InstCombineCalls.cpp | 14 +
 .../Instrumentation/AddressSanitizer.cpp | 96 +-
 lib/Transforms/Scalar/GVN.cpp | 14 +-
 lib/Transforms/Scalar/LICM.cpp | 37 +-
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 2 +-
 lib/Transforms/Scalar/LoopSink.cpp | 3 +-
 lib/Transforms/Utils/FunctionImportUtils.cpp | 24 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp | 59 +-
 test/Analysis/CostModel/AArch64/bswap.ll | 70 +
 test/Analysis/CostModel/AArch64/falkor.ll | 26 +
 test/Analysis/CostModel/AArch64/gep.ll | 66 +-
 test/Analysis/CostModel/X86/arith.ll | 4 +-
 .../CostModel/X86/shuffle-broadcast.ll | 140 +-
 test/Analysis/CostModel/X86/vdiv-cost.ll | 66 +-
 .../CostModel/X86/vshift-ashr-cost.ll | 256 ++-
 .../CostModel/X86/vshift-lshr-cost.ll | 259 ++-
 .../Analysis/CostModel/X86/vshift-shl-cost.ll | 261 ++-
 test/Bitcode/summary_version.ll | 2 +-
 test/Bitcode/thinlto-function-summary.ll | 2 +-
 test/Bitcode/thinlto-summary-section.ll | 8 +-
 .../GlobalISel/arm64-instructionselect.mir | 6 +-
 .../AArch64/GlobalISel/arm64-irtranslator.ll | 88 +-
 .../GlobalISel/irtranslator-exceptions.ll | 4 +-
 .../arm64-collect-loh-garbage-crash.ll | 2 +-
 test/CodeGen/AArch64/arm64-collect-loh-str.ll | 2 +-
 test/CodeGen/AArch64/arm64-collect-loh.ll | 17 +-
 test/CodeGen/AArch64/loh.mir | 193 ++
 test/CodeGen/AArch64/machine-scheduler.mir | 5 +-
 test/CodeGen/AMDGPU/hsa-func.ll | 3 +-
 test/CodeGen/AMDGPU/hsa.ll | 4 +-
 test/CodeGen/Generic/cfi-sections.ll | 39 +
 test/CodeGen/MIR/AArch64/spill-fold.mir | 82 +
 test/CodeGen/MIR/X86/basic-block-liveins.mir | 9 +-
 test/CodeGen/MIR/X86/machine-verifier.mir | 3 +-
 test/CodeGen/NVPTX/tid-range.ll | 18 +
 .../X86/GlobalISel/irtranslator-call.ll | 2 +-
 test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 7 +-
 test/CodeGen/X86/avx512-trunc.ll | 107 +
 test/CodeGen/X86/cmov.ll | 18 +
 test/CodeGen/X86/lower-vec-shift-2.ll | 18 +-
 test/CodeGen/X86/shuffle-vs-trunc-128.ll | 481 ++++
 test/CodeGen/X86/shuffle-vs-trunc-256.ll | 629 ++++++
 test/CodeGen/X86/shuffle-vs-trunc-512.ll | 537 +++++
 test/CodeGen/X86/tail-call-conditional.mir | 3 +-
 test/CodeGen/X86/vector-rotate-128.ll | 2 -
 test/CodeGen/X86/vector-shift-ashr-128.ll | 100 +-
 test/CodeGen/X86/vector-shift-ashr-256.ll | 74 +-
 test/CodeGen/X86/vector-shift-ashr-512.ll | 321 ++-
 test/CodeGen/X86/vector-shift-lshr-128.ll | 74 +-
 test/CodeGen/X86/vector-shift-lshr-256.ll | 74 +-
 test/CodeGen/X86/vector-shift-lshr-512.ll | 317 ++-
 test/CodeGen/X86/vector-shift-shl-128.ll | 74 +-
 test/CodeGen/X86/vector-shift-shl-256.ll | 74 +-
 test/CodeGen/X86/vector-shift-shl-512.ll | 317 ++-
 test/CodeGen/X86/vector-shuffle-128-v16.ll | 89 +-
 test/CodeGen/X86/vector-shuffle-128-v4.ll | 83 +-
 test/CodeGen/X86/vector-shuffle-128-v8.ll | 112 +-
 test/CodeGen/X86/vector-shuffle-masked.ll | 450 ++++
 test/CodeGen/X86/vector-tzcnt-128.ll | 425 +---
 test/CodeGen/X86/vshift-4.ll | 2 -
 .../DebugInfo/Generic/licm-hoist-debug-loc.ll | 75 +
 .../asan-masked-load-store.ll | 76 +-
 test/MC/AsmParser/Inputs/function.x | 3 +
 test/MC/AsmParser/Inputs/module.x | 3 +
 test/MC/AsmParser/include.ll | 13 +
 .../PowerPC/ppc64-encoding-fp.txt | 18 +
 test/MC/PowerPC/ppc64-encoding-fp.s | 32 +-
 test/ThinLTO/X86/Inputs/deadstrip.ll | 22 +
 test/ThinLTO/X86/Inputs/lazyload_metadata.ll | 12 +
 test/ThinLTO/X86/deadstrip.ll | 109 +
 test/ThinLTO/X86/lazyload_metadata.ll | 54 +
 test/Transforms/GVN/PRE/phi-translate.ll | 13 +-
 .../InstCombine/amdgcn-intrinsics.ll | 34 +
 test/Transforms/InstCombine/cos-intrinsic.ll | 55 +
 test/Transforms/InstCombine/icmp-shl-nsw.ll | 218 ++
 test/Transforms/InstCombine/icmp.ll | 63 -
 test/Transforms/InstSimplify/select.ll | 28 +
 test/Transforms/LICM/scalar_promote.ll | 192 ++
 .../LoopVectorize/X86/strided_load_cost.ll | 54 +
 .../LowerTypeTests/Inputs/import-unsat.yaml | 10 +
 .../LowerTypeTests/export-nothing.ll | 7 +
 .../LowerTypeTests/function-disjoint.ll | 6 +-
 .../Transforms/LowerTypeTests/function-ext.ll | 3 +-
 test/Transforms/LowerTypeTests/function.ll | 2 +-
 .../Transforms/LowerTypeTests/import-unsat.ll | 23 +
 test/Transforms/LowerTypeTests/simple.ll | 2 +-
 .../LowerTypeTests/single-offset.ll | 2 +-
 test/Transforms/LowerTypeTests/unsat.ll | 3 +-
 tools/dsymutil/DwarfLinker.cpp | 12 +-
 tools/llc/llc.cpp | 3 +
 tools/llvm-config/llvm-config.cpp | 10 +-
 unittests/ADT/APFloatTest.cpp | 275 ++-
 unittests/ADT/IntrusiveRefCntPtrTest.cpp | 4 +-
 unittests/Bitcode/BitstreamReaderTest.cpp | 4 +-
 .../DebugInfo/DWARF/DWARFDebugInfoTest.cpp | 121 +
 unittests/DebugInfo/DWARF/DwarfGenerator.cpp | 4 +
 unittests/DebugInfo/DWARF/DwarfGenerator.h | 3 +
 utils/lit/lit/formats/googletest.py | 8 +-
 utils/unittest/CMakeLists.txt | 6 +
 utils/unittest/googletest/README.LLVM | 16 +-
 .../include/gtest/gtest-death-test.h | 17 +-
 .../googletest/include/gtest/gtest-message.h | 112 +-
 .../include/gtest/gtest-param-test.h | 56 +-
 .../googletest/include/gtest/gtest-printers.h | 335 ++-
 .../googletest/include/gtest/gtest-spi.h | 7 +-
 .../include/gtest/gtest-test-part.h | 23 +-
 .../include/gtest/gtest-typed-test.h | 11 +-
 .../unittest/googletest/include/gtest/gtest.h | 565 +++--
 .../include/gtest/gtest_pred_impl.h | 12 +-
 .../gtest/internal/custom/gtest-port.h | 69 +
 .../gtest/internal/custom/gtest-printers.h | 42 +
 .../include/gtest/internal/custom/gtest.h | 41 +
 .../internal/gtest-death-test-internal.h | 29 +-
 .../include/gtest/internal/gtest-filepath.h | 16 +-
 .../include/gtest/internal/gtest-internal.h | 403 ++--
 .../include/gtest/internal/gtest-linked_ptr.h | 22 +-
 .../internal/gtest-param-util-generated.h | 679 ++++--
 .../include/gtest/internal/gtest-param-util.h | 190 +-
 .../include/gtest/internal/gtest-port-arch.h | 97 +
 .../include/gtest/internal/gtest-port.h | 1219 ++++++++--
 .../include/gtest/internal/gtest-string.h | 217 +-
 .../include/gtest/internal/gtest-tuple.h | 100 +-
 .../include/gtest/internal/gtest-type-util.h | 21 +-
 .../googletest/src/gtest-death-test.cc | 344 ++-
 .../unittest/googletest/src/gtest-filepath.cc | 43 +-
 .../googletest/src/gtest-internal-inl.h | 332 ++-
 utils/unittest/googletest/src/gtest-port.cc | 699 +++++-
 .../unittest/googletest/src/gtest-printers.cc | 115 +-
 .../googletest/src/gtest-test-part.cc | 12 +-
 .../googletest/src/gtest-typed-test.cc | 42 +-
 utils/unittest/googletest/src/gtest.cc | 2012 +++++++++++------
 235 files changed, 14791 insertions(+), 6255 deletions(-)
 create mode 100644 include/llvm/IR/ModuleSummaryIndexYAML.h
 create mode 100644 include/llvm/Support/TarWriter.h
 create mode 100644 lib/Support/TarWriter.cpp
 create mode 100644 test/Analysis/CostModel/AArch64/bswap.ll
 create mode 100644 test/Analysis/CostModel/AArch64/falkor.ll
 create mode 100644 test/CodeGen/AArch64/loh.mir
 create mode 100644 test/CodeGen/Generic/cfi-sections.ll
 create mode 100644 test/CodeGen/MIR/AArch64/spill-fold.mir
 create mode 100644 test/CodeGen/NVPTX/tid-range.ll
 create mode 100644 test/CodeGen/X86/shuffle-vs-trunc-128.ll
 create mode 100644 test/CodeGen/X86/shuffle-vs-trunc-256.ll
 create mode 100644 test/CodeGen/X86/shuffle-vs-trunc-512.ll
 create mode 100644 test/DebugInfo/Generic/licm-hoist-debug-loc.ll
 create mode 100644 test/MC/AsmParser/Inputs/function.x
 create mode 100644 test/MC/AsmParser/Inputs/module.x
 create mode 100644 test/MC/AsmParser/include.ll
 create mode 100644 test/ThinLTO/X86/Inputs/deadstrip.ll
 create mode 100644 test/ThinLTO/X86/Inputs/lazyload_metadata.ll
 create mode 100644 test/ThinLTO/X86/deadstrip.ll
 create mode 100644 test/ThinLTO/X86/lazyload_metadata.ll
 create mode 100644 test/Transforms/InstCombine/icmp-shl-nsw.ll
 create mode 100644 test/Transforms/LoopVectorize/X86/strided_load_cost.ll
 create mode 100644 test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
 create mode 100644 test/Transforms/LowerTypeTests/export-nothing.ll
 create mode 100644 test/Transforms/LowerTypeTests/import-unsat.ll
 create mode 100644 utils/unittest/googletest/include/gtest/internal/custom/gtest-port.h
 create mode 100644 utils/unittest/googletest/include/gtest/internal/custom/gtest-printers.h
 create mode 100644 utils/unittest/googletest/include/gtest/internal/custom/gtest.h
 create mode 100644 utils/unittest/googletest/include/gtest/internal/gtest-port-arch.h

diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 530a5ddaab4..d76f1293d02 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -167,7 +167,10 @@ check_symbol_exists(futimens sys/stat.h HAVE_FUTIMENS)
 check_symbol_exists(futimes sys/time.h HAVE_FUTIMES)
 check_symbol_exists(posix_fallocate fcntl.h HAVE_POSIX_FALLOCATE)
 # AddressSanitizer conflicts with lib/Support/Unix/Signals.inc
-if( HAVE_SIGNAL_H AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*")
+# Avoid sigaltstack on Apple platforms, where backtrace() cannot handle it
+# (rdar://7089625) and _Unwind_Backtrace is unusable because it cannot unwind
+# past the signal handler after an assertion failure (rdar://29866587).
+if( HAVE_SIGNAL_H AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*" AND NOT APPLE ) check_symbol_exists(sigaltstack signal.h HAVE_SIGALTSTACK) endif() if( HAVE_SYS_UIO_H ) @@ -314,6 +317,8 @@ else() endif() check_cxx_compiler_flag("-Wno-variadic-macros" SUPPORTS_NO_VARIADIC_MACROS_FLAG) +check_cxx_compiler_flag("-Wno-gnu-zero-variadic-macro-arguments" + SUPPORTS_NO_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG) set(USE_NO_MAYBE_UNINITIALIZED 0) set(USE_NO_UNINITIALIZED 0) diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index f35fcf444d2..fbef1d04eac 100755 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -1014,6 +1014,10 @@ function(add_unittest test_suite test_name) if (SUPPORTS_NO_VARIADIC_MACROS_FLAG) list(APPEND LLVM_COMPILE_FLAGS "-Wno-variadic-macros") endif () + # Some parts of gtest rely on this GNU extension, don't warn on it. + if(SUPPORTS_NO_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG) + list(APPEND LLVM_COMPILE_FLAGS "-Wno-gnu-zero-variadic-macro-arguments") + endif() set(LLVM_REQUIRES_RTTI OFF) diff --git a/docs/CompileCudaWithLLVM.rst b/docs/CompileCudaWithLLVM.rst index af681aeead6..6ad8652cfc1 100644 --- a/docs/CompileCudaWithLLVM.rst +++ b/docs/CompileCudaWithLLVM.rst @@ -35,8 +35,8 @@ by many Linux package managers; you probably need to install nvidia's package. You will need CUDA 7.0, 7.5, or 8.0 to compile with clang. -CUDA compilation is supported on Linux, and on MacOS as of XXXX-XX-XX. Windows -support is planned but not yet in place. +CUDA compilation is supported on Linux, on MacOS as of 2016-11-18, and on +Windows as of 2017-01-05. Invoking clang -------------- diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst index 06a9c6af9b4..8d1984b65cd 100644 --- a/docs/Phabricator.rst +++ b/docs/Phabricator.rst @@ -132,7 +132,7 @@ committed to trunk. If you do not have commit access, someone has to commit the change for you (with attribution). It is sufficient to add a comment to the approved review indicating you cannot commit the patch yourself. If you have commit access, there are multiple workflows to commit the -change. Whichever method you follow it is recommend that your commit message +change. Whichever method you follow it is recommended that your commit message ends with the line: :: diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h index 54ef1a688d3..6fbe532112b 100644 --- a/include/llvm/Analysis/CGSCCPassManager.h +++ b/include/llvm/Analysis/CGSCCPassManager.h @@ -128,7 +128,7 @@ extern template class PassManager diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index d583614284f..b4a6c5c2fae 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -36,6 +36,8 @@ namespace llvm { class Function; class GlobalValue; class Loop; +class ScalarEvolution; +class SCEV; class Type; class User; class Value; @@ -613,10 +615,11 @@ class TargetTransformInfo { /// merged into the instruction indexing mode. Some targets might want to /// distinguish between address computation for memory operations on vector /// types and scalar types. Such targets should override this function. - /// The 'IsComplex' parameter is a hint that the address computation is likely - /// to involve multiple instructions and as such unlikely to be merged into - /// the address indexing mode. 
- int getAddressComputationCost(Type *Ty, bool IsComplex = false) const; + /// The 'SE' parameter holds pointer for the scalar evolution object which + /// is used in order to get the Ptr step value in case of constant stride. + /// The 'Ptr' parameter holds SCEV of the access pointer. + int getAddressComputationCost(Type *Ty, ScalarEvolution *SE = nullptr, + const SCEV *Ptr = nullptr) const; /// \returns The cost, if any, of keeping values of the given types alive /// over a callsite. @@ -795,7 +798,8 @@ class TargetTransformInfo::Concept { virtual int getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) = 0; virtual unsigned getNumberOfParts(Type *Tp) = 0; - virtual int getAddressComputationCost(Type *Ty, bool IsComplex) = 0; + virtual int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) = 0; virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) = 0; virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) = 0; @@ -1044,8 +1048,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getNumberOfParts(Type *Tp) override { return Impl.getNumberOfParts(Tp); } - int getAddressComputationCost(Type *Ty, bool IsComplex) override { - return Impl.getAddressComputationCost(Ty, IsComplex); + int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) override { + return Impl.getAddressComputationCost(Ty, SE, Ptr); } unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) override { return Impl.getCostOfKeepingLiveOverCall(Tys); diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 68b38a7fa53..1d7edbaf7df 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -15,6 +15,7 @@ #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H #define LLVM_ANALYSIS_TARGETTRANSFORMINFOIMPL_H +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" @@ -370,7 +371,10 @@ class TargetTransformInfoImplBase { unsigned getNumberOfParts(Type *Tp) { return 0; } - unsigned getAddressComputationCost(Type *Tp, bool) { return 0; } + unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *, + const SCEV *) { + return 0; + } unsigned getReductionCost(unsigned, Type *, bool) { return 1; } @@ -422,6 +426,30 @@ class TargetTransformInfoImplBase { VectorType *VecTy) const { return VF; } +protected: + bool isStridedAccess(const SCEV *Ptr) { + return Ptr && isa(Ptr); + } + + const SCEVConstant *getConstantStrideStep(ScalarEvolution *SE, + const SCEV *Ptr) { + if (!isStridedAccess(Ptr)) + return nullptr; + const SCEVAddRecExpr *AddRec = cast(Ptr); + return dyn_cast(AddRec->getStepRecurrence(*SE)); + } + + bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, + int64_t MergeDistance) { + const SCEVConstant *Step = getConstantStrideStep(SE, Ptr); + if (!Step) + return false; + APInt StrideVal = Step->getAPInt(); + if (StrideVal.getBitWidth() > 64) + return false; + // FIXME: need to take absolute value for negtive stride case + return StrideVal.getSExtValue() < MergeDistance; + } }; /// \brief CRTP base class for use as a mix-in that aids implementing diff --git a/include/llvm/Bitcode/BitCodes.h b/include/llvm/Bitcode/BitCodes.h index cfc7a1d7d6b..bf21e146e77 100644 --- a/include/llvm/Bitcode/BitCodes.h +++ b/include/llvm/Bitcode/BitCodes.h @@ -18,7 +18,6 @@ #ifndef 
LLVM_BITCODE_BITCODES_H #define LLVM_BITCODE_BITCODES_H -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" @@ -166,11 +165,8 @@ template <> struct isPodLike { static const bool value=true; }; /// BitCodeAbbrev - This class represents an abbreviation record. An /// abbreviation allows a complex record that has redundancy to be stored in a /// specialized format instead of the fully-general, fully-vbr, format. -class BitCodeAbbrev : public RefCountedBase { +class BitCodeAbbrev { SmallVector OperandList; - // Only RefCountedBase is allowed to delete. - ~BitCodeAbbrev() = default; - friend class RefCountedBase; public: unsigned getNumOperandInfos() const { diff --git a/include/llvm/Bitcode/BitstreamReader.h b/include/llvm/Bitcode/BitstreamReader.h index 4d95a6ce8a1..fc06eeefbf2 100644 --- a/include/llvm/Bitcode/BitstreamReader.h +++ b/include/llvm/Bitcode/BitstreamReader.h @@ -16,7 +16,6 @@ #define LLVM_BITCODE_BITSTREAMREADER_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Bitcode/BitCodes.h" #include "llvm/Support/Endian.h" @@ -42,7 +41,7 @@ class BitstreamBlockInfo { /// describe abbreviations that all blocks of the specified ID inherit. struct BlockInfo { unsigned BlockID; - std::vector> Abbrevs; + std::vector> Abbrevs; std::string Name; std::vector > RecordNames; }; @@ -316,11 +315,11 @@ class BitstreamCursor : SimpleBitstreamCursor { unsigned CurCodeSize = 2; /// Abbrevs installed at in this block. - std::vector> CurAbbrevs; + std::vector> CurAbbrevs; struct Block { unsigned PrevCodeSize; - std::vector> PrevAbbrevs; + std::vector> PrevAbbrevs; explicit Block(unsigned PCS) : PrevCodeSize(PCS) {} }; @@ -478,8 +477,8 @@ class BitstreamCursor : SimpleBitstreamCursor { return CurAbbrevs[AbbrevNo].get(); } - /// Read the current record and discard it. - void skipRecord(unsigned AbbrevID); + /// Read the current record and discard it, returning the code for the record. + unsigned skipRecord(unsigned AbbrevID); unsigned readRecord(unsigned AbbrevID, SmallVectorImpl &Vals, StringRef *Blob = nullptr); diff --git a/include/llvm/Bitcode/BitstreamWriter.h b/include/llvm/Bitcode/BitstreamWriter.h index 8eb6e8aef7a..e276db5f92f 100644 --- a/include/llvm/Bitcode/BitstreamWriter.h +++ b/include/llvm/Bitcode/BitstreamWriter.h @@ -43,12 +43,12 @@ class BitstreamWriter { unsigned BlockInfoCurBID; /// CurAbbrevs - Abbrevs installed at in this block. - std::vector> CurAbbrevs; + std::vector> CurAbbrevs; struct Block { unsigned PrevCodeSize; size_t StartSizeWord; - std::vector> PrevAbbrevs; + std::vector> PrevAbbrevs; Block(unsigned PCS, size_t SSW) : PrevCodeSize(PCS), StartSizeWord(SSW) {} }; @@ -59,7 +59,7 @@ class BitstreamWriter { /// These describe abbreviations that all blocks of the specified ID inherit. struct BlockInfo { unsigned BlockID; - std::vector> Abbrevs; + std::vector> Abbrevs; }; std::vector BlockInfoRecords; @@ -469,12 +469,12 @@ class BitstreamWriter { private: // Emit the abbreviation as a DEFINE_ABBREV record. 
- void EncodeAbbrev(BitCodeAbbrev *Abbv) { + void EncodeAbbrev(const BitCodeAbbrev &Abbv) { EmitCode(bitc::DEFINE_ABBREV); - EmitVBR(Abbv->getNumOperandInfos(), 5); - for (unsigned i = 0, e = static_cast(Abbv->getNumOperandInfos()); + EmitVBR(Abbv.getNumOperandInfos(), 5); + for (unsigned i = 0, e = static_cast(Abbv.getNumOperandInfos()); i != e; ++i) { - const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i); + const BitCodeAbbrevOp &Op = Abbv.getOperandInfo(i); Emit(Op.isLiteral(), 1); if (Op.isLiteral()) { EmitVBR64(Op.getLiteralValue(), 8); @@ -489,10 +489,10 @@ class BitstreamWriter { /// EmitAbbrev - This emits an abbreviation to the stream. Note that this /// method takes ownership of the specified abbrev. - unsigned EmitAbbrev(BitCodeAbbrev *Abbv) { + unsigned EmitAbbrev(std::shared_ptr Abbv) { // Emit the abbreviation as a record. - EncodeAbbrev(Abbv); - CurAbbrevs.push_back(Abbv); + EncodeAbbrev(*Abbv); + CurAbbrevs.push_back(std::move(Abbv)); return static_cast(CurAbbrevs.size())-1 + bitc::FIRST_APPLICATION_ABBREV; } @@ -532,13 +532,13 @@ class BitstreamWriter { /// EmitBlockInfoAbbrev - Emit a DEFINE_ABBREV record for the specified /// BlockID. - unsigned EmitBlockInfoAbbrev(unsigned BlockID, BitCodeAbbrev *Abbv) { + unsigned EmitBlockInfoAbbrev(unsigned BlockID, std::shared_ptr Abbv) { SwitchToBlockID(BlockID); - EncodeAbbrev(Abbv); + EncodeAbbrev(*Abbv); // Add the abbrev to the specified block record. BlockInfo &Info = getOrCreateBlockInfo(BlockID); - Info.Abbrevs.push_back(Abbv); + Info.Abbrevs.push_back(std::move(Abbv)); return Info.Abbrevs.size()-1+bitc::FIRST_APPLICATION_ABBREV; } diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index be8822df3db..f0be955110f 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -140,6 +140,9 @@ class AsmPrinter : public MachineFunctionPass { /// If the target supports dwarf debug info, this pointer is non-null. DwarfDebug *DD; + /// If the current module uses dwarf CFI annotations strictly for debugging. + bool isCFIMoveForDebugging; + protected: explicit AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); @@ -262,6 +265,10 @@ class AsmPrinter : public MachineFunctionPass { enum CFIMoveType { CFI_M_None, CFI_M_EH, CFI_M_Debug }; CFIMoveType needsCFIMoves(); + /// Returns false if needsCFIMoves() == CFI_M_EH for any function + /// in the module. + bool needsOnlyDebugCFIMoves() const { return isCFIMoveForDebugging; } + bool needsSEHMoves(); /// Print to the current output stream assembly representations of the diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index df0dc1a38ae..8e96336b981 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -925,7 +925,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return LT.first; } - unsigned getAddressComputationCost(Type *Ty, bool IsComplex) { return 0; } + unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, + const SCEV *) { + return 0; + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) { assert(Ty->isVectorTy() && "Expect a vector type"); diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h index 1e3476cd839..09c3bf6a1b5 100644 --- a/include/llvm/CodeGen/DIE.h +++ b/include/llvm/CodeGen/DIE.h @@ -651,6 +651,9 @@ class DIE : IntrusiveBackListNode, public DIEValueList { unsigned AbbrevNumber = ~0u; /// Dwarf tag code. 
dwarf::Tag Tag = (dwarf::Tag)0; + /// Set to true to force a DIE to emit an abbreviation that says it has + /// children even when it doesn't. This is used for unit testing purposes. + bool ForceChildren; /// Children DIEs. IntrusiveBackList Children; @@ -659,7 +662,8 @@ class DIE : IntrusiveBackListNode, public DIEValueList { PointerUnion Owner; DIE() = delete; - explicit DIE(dwarf::Tag Tag) : Offset(0), Size(0), Tag(Tag) {} + explicit DIE(dwarf::Tag Tag) : Offset(0), Size(0), Tag(Tag), + ForceChildren(false) {} public: static DIE *get(BumpPtrAllocator &Alloc, dwarf::Tag Tag) { @@ -677,7 +681,8 @@ class DIE : IntrusiveBackListNode, public DIEValueList { /// Get the compile/type unit relative offset of this DIE. unsigned getOffset() const { return Offset; } unsigned getSize() const { return Size; } - bool hasChildren() const { return !Children.empty(); } + bool hasChildren() const { return ForceChildren || !Children.empty(); } + void setForceChildren(bool B) { ForceChildren = B; } typedef IntrusiveBackList::iterator child_iterator; typedef IntrusiveBackList::const_iterator const_child_iterator; diff --git a/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 76e0d47ceea..26ba5c67beb 100644 --- a/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -180,6 +180,8 @@ class IRTranslator : public MachineFunctionPass { /// \pre \p U is a branch instruction. bool translateBr(const User &U, MachineIRBuilder &MIRBuilder); + bool translateSwitch(const User &U, MachineIRBuilder &MIRBuilder); + bool translateExtractValue(const User &U, MachineIRBuilder &MIRBuilder); bool translateInsertValue(const User &U, MachineIRBuilder &MIRBuilder); @@ -292,12 +294,8 @@ class IRTranslator : public MachineFunctionPass { return translateBinaryOp(TargetOpcode::G_FREM, U, MIRBuilder); } - // Stubs to keep the compiler happy while we implement the rest of the // translation. - bool translateSwitch(const User &U, MachineIRBuilder &MIRBuilder) { - return false; - } bool translateIndirectBr(const User &U, MachineIRBuilder &MIRBuilder) { return false; } diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index be811c6fe43..92a9896d7a1 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -308,7 +308,7 @@ class MachineBasicBlock // Iteration support for live in sets. These sets are kept in sorted // order by their register number. 
typedef LiveInVector::const_iterator livein_iterator; - livein_iterator livein_begin() const { return LiveIns.begin(); } + livein_iterator livein_begin() const; livein_iterator livein_end() const { return LiveIns.end(); } bool livein_empty() const { return LiveIns.empty(); } iterator_range liveins() const { diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h index f33758de6a5..5a24b7c8729 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -10,6 +10,8 @@ #ifndef LLVM_LIB_DEBUGINFO_DWARFDIE_H #define LLVM_LIB_DEBUGINFO_DWARFDIE_H +#include "llvm/ADT/iterator.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h" @@ -40,9 +42,6 @@ class DWARFDie { bool isValid() const { return U && Die; } explicit operator bool() const { return isValid(); } - bool operator ==(const DWARFDie &RHS) const { - return Die == RHS.Die && U == RHS.U; - } const DWARFDebugInfoEntry *getDebugInfoEntry() const { return Die; } DWARFUnit *getDwarfUnit() const { return U; } @@ -361,8 +360,61 @@ class DWARFDie { getInlinedChainForAddress(const uint64_t Address, SmallVectorImpl &InlinedChain) const; + class iterator; + + iterator begin() const; + iterator end() const; + iterator_range children() const; }; + +inline bool operator==(const DWARFDie &LHS, const DWARFDie &RHS) { + return LHS.getDebugInfoEntry() == RHS.getDebugInfoEntry() && + LHS.getDwarfUnit() == RHS.getDwarfUnit(); +} + +inline bool operator!=(const DWARFDie &LHS, const DWARFDie &RHS) { + return !(LHS == RHS); +} + +class DWARFDie::iterator : public iterator_facade_base { + DWARFDie Die; + void skipNull() { + if (Die && Die.isNULL()) + Die = DWARFDie(); + } +public: + iterator() = default; + explicit iterator(DWARFDie D) : Die(D) { + // If we start out with only a Null DIE then invalidate. + skipNull(); + } + iterator &operator++() { + Die = Die.getSibling(); + // Don't include the NULL die when iterating. + skipNull(); + return *this; + } + explicit operator bool() const { return Die.isValid(); } + const DWARFDie &operator*() const { return Die; } + bool operator==(const iterator &X) const { return Die == X.Die; } +}; + +// These inline functions must follow the DWARFDie::iterator definition above +// as they use functions from that class. +inline DWARFDie::iterator DWARFDie::begin() const { + return iterator(getFirstChild()); +} + +inline DWARFDie::iterator DWARFDie::end() const { + return iterator(); +} + +inline iterator_range DWARFDie::children() const { + return make_range(begin(), end()); +} } // end namespace llvm diff --git a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h index 43b597de000..83a7b9a844f 100644 --- a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h +++ b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h @@ -47,9 +47,9 @@ class RawByteChannel { /// Locks the channel for writing. 
template Error startSendMessage(const FunctionIdT &FnId, const SequenceIdT &SeqNo) { + writeLock.lock(); if (auto Err = serializeSeq(*this, FnId, SeqNo)) return Err; - writeLock.lock(); return Error::success(); } diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index 2cfe673d970..ecb0435a1e1 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -28,6 +28,10 @@ namespace llvm { +namespace yaml { +template struct MappingTraits; +} + /// \brief Class to accumulate and hold information about a callee. struct CalleeInfo { enum class HotnessType : uint8_t { Unknown = 0, Cold = 1, None = 2, Hot = 3 }; @@ -102,7 +106,7 @@ class GlobalValueSummary { /// \brief Sububclass discriminator (for dyn_cast<> et al.) enum SummaryKind : unsigned { AliasKind, FunctionKind, GlobalVarKind }; - /// Group flags (Linkage, noRename, isOptSize, etc.) as a bitfield. + /// Group flags (Linkage, NotEligibleToImport, etc.) as a bitfield. struct GVFlags { /// \brief The linkage type of the associated global value. /// @@ -113,39 +117,20 @@ class GlobalValueSummary { /// types based on global summary-based analysis. unsigned Linkage : 4; - /// Indicate if the global value cannot be renamed (in a specific section, - /// possibly referenced from inline assembly, etc). - unsigned NoRename : 1; - - /// Indicate if a function contains inline assembly (which is opaque), - /// that may reference a local value. This is used to prevent importing - /// of this function, since we can't promote and rename the uses of the - /// local in the inline assembly. Use a flag rather than bloating the - /// summary with references to every possible local value in the - /// llvm.used set. - unsigned HasInlineAsmMaybeReferencingInternal : 1; + /// Indicate if the global value cannot be imported (e.g. it cannot + /// be renamed or references something that can't be renamed). + unsigned NotEligibleToImport : 1; - /// Indicate if the function is not viable to inline. - unsigned IsNotViableToInline : 1; + /// Indicate that the global value must be considered a live root for + /// index-based liveness analysis. Used for special LLVM values such as + /// llvm.global_ctors that the linker does not know about. + unsigned LiveRoot : 1; /// Convenience Constructors - explicit GVFlags(GlobalValue::LinkageTypes Linkage, bool NoRename, - bool HasInlineAsmMaybeReferencingInternal, - bool IsNotViableToInline) - : Linkage(Linkage), NoRename(NoRename), - HasInlineAsmMaybeReferencingInternal( - HasInlineAsmMaybeReferencingInternal), - IsNotViableToInline(IsNotViableToInline) {} - - GVFlags(const GlobalValue &GV) - : Linkage(GV.getLinkage()), NoRename(GV.hasSection()), - HasInlineAsmMaybeReferencingInternal(false) { - IsNotViableToInline = false; - if (const auto *F = dyn_cast(&GV)) - // Inliner doesn't handle variadic functions. - // FIXME: refactor this to use the same code that inliner is using. - IsNotViableToInline = F->isVarArg(); - } + explicit GVFlags(GlobalValue::LinkageTypes Linkage, + bool NotEligibleToImport, bool LiveRoot) + : Linkage(Linkage), NotEligibleToImport(NotEligibleToImport), + LiveRoot(LiveRoot) {} }; private: @@ -213,31 +198,19 @@ class GlobalValueSummary { Flags.Linkage = Linkage; } - bool isNotViableToInline() const { return Flags.IsNotViableToInline; } - - /// Return true if this summary is for a GlobalValue that needs promotion - /// to be referenced from another module. 
- bool needsRenaming() const { return GlobalValue::isLocalLinkage(linkage()); } + /// Return true if this global value can't be imported. + bool notEligibleToImport() const { return Flags.NotEligibleToImport; } - /// Return true if this global value cannot be renamed (in a specific section, - /// possibly referenced from inline assembly, etc). - bool noRename() const { return Flags.NoRename; } + /// Return true if this global value must be considered a root for live + /// value analysis on the index. + bool liveRoot() const { return Flags.LiveRoot; } - /// Flag that this global value cannot be renamed (in a specific section, - /// possibly referenced from inline assembly, etc). - void setNoRename() { Flags.NoRename = true; } + /// Flag that this global value must be considered a root for live + /// value analysis on the index. + void setLiveRoot() { Flags.LiveRoot = true; } - /// Return true if this global value possibly references another value - /// that can't be renamed. - bool hasInlineAsmMaybeReferencingInternal() const { - return Flags.HasInlineAsmMaybeReferencingInternal; - } - - /// Flag that this global value possibly references another value that - /// can't be renamed. - void setHasInlineAsmMaybeReferencingInternal() { - Flags.HasInlineAsmMaybeReferencingInternal = true; - } + /// Flag that this global value cannot be imported. + void setNotEligibleToImport() { Flags.NotEligibleToImport = true; } /// Return the list of values referenced by this global value definition. ArrayRef refs() const { return RefEdgeList; } @@ -330,6 +303,30 @@ class GlobalVarSummary : public GlobalValueSummary { } }; +struct TypeTestResolution { + /// Specifies which kind of type check we should emit for this byte array. + /// See http://clang.llvm.org/docs/ControlFlowIntegrityDesign.html for full + /// details on each kind of check; the enumerators are described with + /// reference to that document. + enum Kind { + Unsat, ///< Unsatisfiable type (i.e. no global has this type metadata) + ByteArray, ///< Test a byte array (first example) + Inline, ///< Inlined bit vector ("Short Inline Bit Vectors") + Single, ///< Single element (last example in "Short Inline Bit Vectors") + AllOnes, ///< All-ones bit vector ("Eliminating Bit Vector Checks for + /// All-Ones Bit Vectors") + } TheKind = Unsat; + + /// Range of the size expressed as a bit width. For example, if the size is in + /// range [0,256), this number will be 8. This helps generate the most compact + /// instruction sequences. + unsigned SizeBitWidth = 0; +}; + +struct TypeIdSummary { + TypeTestResolution TTRes; +}; + /// 160 bits SHA1 typedef std::array ModuleHash; @@ -370,11 +367,20 @@ class ModuleSummaryIndex { /// Holds strings for combined index, mapping to the corresponding module ID. ModulePathStringTableTy ModulePathStringTable; + /// Mapping from type identifiers to summary information for that type + /// identifier. + // FIXME: Add bitcode read/write support for this field. + std::map TypeIdMap; + + // YAML I/O support. + friend yaml::MappingTraits; + public: gvsummary_iterator begin() { return GlobalValueMap.begin(); } const_gvsummary_iterator begin() const { return GlobalValueMap.begin(); } gvsummary_iterator end() { return GlobalValueMap.end(); } const_gvsummary_iterator end() const { return GlobalValueMap.end(); } + size_t size() const { return GlobalValueMap.size(); } /// Get the list of global value summary objects for a given value name. 
const GlobalValueSummaryList &getGlobalValueSummaryList(StringRef ValueName) { diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h new file mode 100644 index 00000000000..a8c8ff9ef2e --- /dev/null +++ b/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -0,0 +1,111 @@ +//===-- llvm/ModuleSummaryIndexYAML.h - YAML I/O for summary ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_MODULESUMMARYINDEXYAML_H +#define LLVM_IR_MODULESUMMARYINDEXYAML_H + +#include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/Support/YAMLTraits.h" + +namespace llvm { +namespace yaml { + +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &io, TypeTestResolution::Kind &value) { + io.enumCase(value, "Unsat", TypeTestResolution::Unsat); + io.enumCase(value, "ByteArray", TypeTestResolution::ByteArray); + io.enumCase(value, "Inline", TypeTestResolution::Inline); + io.enumCase(value, "Single", TypeTestResolution::Single); + io.enumCase(value, "AllOnes", TypeTestResolution::AllOnes); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &io, TypeTestResolution &res) { + io.mapRequired("Kind", res.TheKind); + io.mapRequired("SizeBitWidth", res.SizeBitWidth); + } +}; + +template <> struct MappingTraits { + static void mapping(IO &io, TypeIdSummary& summary) { + io.mapRequired("TTRes", summary.TTRes); + } +}; + +struct FunctionSummaryYaml { + std::vector TypeTests; +}; + +} // End yaml namespace +} // End llvm namespace + +LLVM_YAML_IS_SEQUENCE_VECTOR(uint64_t) + +namespace llvm { +namespace yaml { + +template <> struct MappingTraits { + static void mapping(IO &io, FunctionSummaryYaml& summary) { + io.mapRequired("TypeTests", summary.TypeTests); + } +}; + +} // End yaml namespace +} // End llvm namespace + +LLVM_YAML_IS_STRING_MAP(TypeIdSummary) +LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionSummaryYaml) + +namespace llvm { +namespace yaml { + +// FIXME: Add YAML mappings for the rest of the module summary. 
+template <> struct CustomMappingTraits { + static void inputOne(IO &io, StringRef Key, GlobalValueSummaryMapTy &V) { + std::vector FSums; + io.mapRequired(Key.str().c_str(), FSums); + uint64_t KeyInt; + if (Key.getAsInteger(0, KeyInt)) { + io.setError("key not an integer"); + return; + } + auto &Elem = V[KeyInt]; + for (auto &FSum : FSums) { + GlobalValueSummary::GVFlags GVFlags(GlobalValue::ExternalLinkage, false, + false); + Elem.push_back(llvm::make_unique( + GVFlags, 0, ArrayRef{}, + ArrayRef{}, std::move(FSum.TypeTests))); + } + } + static void output(IO &io, GlobalValueSummaryMapTy &V) { + for (auto &P : V) { + std::vector FSums; + for (auto &Sum : P.second) { + if (auto *FSum = dyn_cast(Sum.get())) + FSums.push_back(FunctionSummaryYaml{FSum->type_tests()}); + } + if (!FSums.empty()) + io.mapRequired(llvm::utostr(P.first).c_str(), FSums); + } + } +}; + +template <> struct MappingTraits { + static void mapping(IO &io, ModuleSummaryIndex& index) { + io.mapRequired("GlobalValueMap", index.GlobalValueMap); + io.mapRequired("TypeIdMap", index.TypeIdMap); + } +}; + +} // End yaml namespace +} // End llvm namespace + +#endif diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h index 3e4edd893d3..7a63956f1cd 100644 --- a/include/llvm/IR/PassManager.h +++ b/include/llvm/IR/PassManager.h @@ -64,32 +64,31 @@ namespace llvm { struct alignas(8) AnalysisKey {}; /// A special type used to provide an address that identifies a set of related -/// analyses. +/// analyses. These sets are primarily used below to mark sets of analyses as +/// preserved. /// -/// These sets are primarily used below to mark sets of analyses as preserved. -/// An example would be analyses depending only on the CFG of a function. -/// A transformation can mark that it is preserving the CFG of a function and -/// then analyses can check for this rather than each transform having to fully -/// enumerate every analysis preserved. +/// For example, a transformation can indicate that it preserves the CFG of a +/// function by preserving the appropriate AnalysisSetKey. An analysis that +/// depends only on the CFG can then check if that AnalysisSetKey is preserved; +/// if it is, the analysis knows that it itself is preserved. struct alignas(8) AnalysisSetKey {}; -/// Class for tracking what analyses are preserved after a transformation pass -/// runs over some unit of IR. +/// A set of analyses that are preserved following a run of a transformation +/// pass. /// -/// Transformation passes build and return these objects when run over the IR -/// to communicate which analyses remain valid afterward. For most passes this -/// is fairly simple: if they don't change anything all analyses are preserved, +/// Transformation passes build and return these objects to communicate which +/// analyses are still valid after the transformation. For most passes this is +/// fairly simple: if they don't change anything all analyses are preserved, /// otherwise only a short list of analyses that have been explicitly updated /// are preserved. /// -/// This class also provides the ability to mark abstract *sets* of analyses as -/// preserved. These sets allow passes to indicate that they preserve broad -/// aspects of the IR (such as its CFG) and analyses to opt in to that being -/// sufficient without the passes having to fully enumerate such analyses. +/// This class also lets transformation passes mark abstract *sets* of analyses +/// as preserved. 
A transformation that (say) does not alter the CFG can +/// indicate such by marking a particular AnalysisSetKey as preserved, and +/// then analyses can query whether that AnalysisSetKey is preserved. /// -/// Finally, this class can represent "abandoning" an analysis, which marks it -/// as not-preserved even if it would be covered by some abstract set of -/// analyses. +/// Finally, this class can represent an "abandoned" analysis, which is +/// not preserved even if it would be covered by some abstract set of analyses. /// /// Given a `PreservedAnalyses` object, an analysis will typically want to /// figure out whether it is preserved. In the example below, MyAnalysisType is @@ -120,7 +119,8 @@ class PreservedAnalyses { /// Mark an analysis as preserved. template void preserve() { preserve(AnalysisT::ID()); } - /// Mark an analysis as preserved using its ID. + /// \brief Given an analysis's ID, mark the analysis as preserved, adding it + /// to the set. void preserve(AnalysisKey *ID) { // Clear this ID from the explicit not-preserved set if present. NotPreservedAnalysisIDs.erase(ID); @@ -224,17 +224,17 @@ class PreservedAnalyses { : PA(PA), ID(ID), IsAbandoned(PA.NotPreservedAnalysisIDs.count(ID)) {} public: - /// Returns true if the checker's analysis was not abandoned and the - /// analysis is either is explicitly preserved or all analyses are - /// preserved. + /// Returns true if the checker's analysis was not abandoned and either + /// - the analysis is explicitly preserved or + /// - all analyses are preserved. bool preserved() { return !IsAbandoned && (PA.PreservedIDs.count(&AllAnalysesKey) || PA.PreservedIDs.count(ID)); } - /// Returns true if the checker's analysis was not abandoned and either the - /// provided set type is either explicitly preserved or all analyses are - /// preserved. + /// Returns true if the checker's analysis was not abandoned and either + /// - \p AnalysisSetT is explicitly preserved or + /// - all analyses are preserved. template bool preservedSet() { AnalysisSetKey *SetID = AnalysisSetT::ID(); return !IsAbandoned && (PA.PreservedIDs.count(&AllAnalysesKey) || @@ -262,8 +262,8 @@ class PreservedAnalyses { /// Test whether all analyses are preserved (and none are abandoned). /// - /// This lets analyses optimize for the common case where a transformation - /// made no changes to the IR. + /// This is used primarily to optimize for the common case of a transformation + /// which makes no changes to the IR. bool areAllPreserved() const { return NotPreservedAnalysisIDs.empty() && PreservedIDs.count(&AllAnalysesKey); @@ -307,9 +307,9 @@ template class AnalysisManager; /// A CRTP mix-in to automatically provide informational APIs needed for /// passes. /// -/// This provides some boiler plate for types that are passes. +/// This provides some boilerplate for types that are passes. template struct PassInfoMixin { - /// Returns the name of the derived pass type. + /// Gets the name of the pass we are mixed into. static StringRef name() { StringRef Name = getTypeName(); if (Name.startswith("llvm::")) @@ -318,41 +318,35 @@ template struct PassInfoMixin { } }; -/// A CRTP mix-in to automatically provide informational APIs needed for -/// analysis passes. +/// A CRTP mix-in that provides informational APIs needed for analysis passes. /// -/// This provides some boiler plate for types that are analysis passes. It -/// automatically mixes in \c PassInfoMixin and adds informational APIs -/// specifically used for analyses. 
+/// This provides some boilerplate for types that are analysis passes. It +/// automatically mixes in \c PassInfoMixin. template struct AnalysisInfoMixin : PassInfoMixin { /// Returns an opaque, unique ID for this analysis type. /// - /// This ID is a pointer type that is guaranteed to be 8-byte aligned and - /// thus suitable for use in sets, maps, and other data structures optimized - /// for pointer-like types using the alignment-provided low bits. + /// This ID is a pointer type that is guaranteed to be 8-byte aligned and thus + /// suitable for use in sets, maps, and other data structures that use the low + /// bits of pointers. /// /// Note that this requires the derived type provide a static \c AnalysisKey /// member called \c Key. /// - /// FIXME: The only reason the derived type needs to provide this rather than - /// this mixin providing it is due to broken implementations which cannot - /// correctly unique a templated static so that they have the same addresses - /// for each instantiation and are definitively emitted once for each - /// instantiation. The only currently known platform with this limitation are - /// Windows DLL builds, specifically building each part of LLVM as a DLL. If - /// we ever remove that build configuration, this mixin can provide the - /// static key as well. + /// FIXME: The only reason the mixin type itself can't declare the Key value + /// is that some compilers cannot correctly unique a templated static variable + /// so it has the same addresses in each instantiation. The only currently + /// known platform with this limitation is Windows DLL builds, specifically + /// building each part of LLVM as a DLL. If we ever remove that build + /// configuration, this mixin can provide the static key as well. static AnalysisKey *ID() { return &DerivedT::Key; } }; -/// A class template to provide analysis sets for IR units. +/// This templated class represents "all analyses that operate over \" (e.g. a Function or a Module) in instances of +/// PreservedAnalysis. /// -/// Analyses operate on units of IR. It is useful to be able to talk about -/// preservation of all analyses for a given unit of IR as a set. This class -/// template can be used with the \c PreservedAnalyses API for that purpose and -/// the \c AnalysisManager will automatically check and use this set to skip -/// invalidation events. +/// This lets a transformation say e.g. "I preserved all function analyses". /// /// Note that you must provide an explicit instantiation declaration and /// definition for this template in order to get the correct behavior on @@ -371,17 +365,18 @@ template AnalysisSetKey AllAnalysesOn::SetKey; extern template class AllAnalysesOn; extern template class AllAnalysesOn; -/// \brief Manages a sequence of passes over units of IR. +/// \brief Manages a sequence of passes over a particular unit of IR. /// -/// A pass manager contains a sequence of passes to run over units of IR. It is -/// itself a valid pass over that unit of IR, and when over some given IR will -/// run each pass in sequence. This is the primary and most basic building -/// block of a pass pipeline. +/// A pass manager contains a sequence of passes to run over a particular unit +/// of IR (e.g. Functions, Modules). It is itself a valid pass over that unit of +/// IR, and when run over some given IR will run each of its contained passes in +/// sequence. Pass managers are the primary and most basic building block of a +/// pass pipeline. 
/// -/// If it is run with an \c AnalysisManager argument, it will propagate -/// that analysis manager to each pass it runs, as well as calling the analysis -/// manager's invalidation routine with the PreservedAnalyses of each pass it -/// runs. +/// When you run a pass manager, you provide an \c AnalysisManager +/// argument. The pass manager will propagate that analysis manager to each +/// pass it runs, and will call the analysis manager's invalidation routine with +/// the PreservedAnalyses of each pass it runs. template , typename... ExtraArgTs> @@ -390,7 +385,7 @@ class PassManager : public PassInfoMixin< public: /// \brief Construct a pass manager. /// - /// It can be passed a flag to get debug logging as the passes are run. + /// If \p DebugLogging is true, we'll log our progress to llvm::dbgs(). explicit PassManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {} // FIXME: These are equivalent to the default move constructor/move @@ -400,13 +395,15 @@ class PassManager : public PassInfoMixin< PassManager(PassManager &&Arg) : Passes(std::move(Arg.Passes)), DebugLogging(std::move(Arg.DebugLogging)) {} + PassManager &operator=(PassManager &&RHS) { Passes = std::move(RHS.Passes); DebugLogging = std::move(RHS.DebugLogging); return *this; } - /// \brief Run all of the passes in this manager over the IR. + /// \brief Run all of the passes in this manager over the given unit of IR. + /// ExtraArgs are passed to each pass. PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM, ExtraArgTs... ExtraArgs) { PreservedAnalyses PA = PreservedAnalyses::all(); @@ -425,7 +422,7 @@ class PassManager : public PassInfoMixin< // invalidates analyses. AM.invalidate(IR, PassPA); - // Finally, we intersect the preserved analyses to compute the aggregate + // Finally, intersect the preserved analyses to compute the aggregate // preserved set for this pass manager. PA.intersect(std::move(PassPA)); @@ -473,30 +470,29 @@ extern template class PassManager; /// \brief Convenience typedef for a pass manager over functions. typedef PassManager FunctionPassManager; -/// \brief A generic analysis pass manager with lazy running and caching of +/// \brief A container for analyses that lazily runs them and caches their /// results. /// -/// This analysis manager can be used for any IR unit where the address of the -/// IR unit sufficies as its identity. It manages the cache for a unit of IR via -/// the address of each unit of IR cached. +/// This class can manage analyses for any IR unit where the address of the IR +/// unit sufficies as its identity. template class AnalysisManager { public: class Invalidator; private: - // Now that we've defined our invalidator, we can build types for the concept - // types. + // Now that we've defined our invalidator, we can define the concept types. typedef detail::AnalysisResultConcept ResultConceptT; typedef detail::AnalysisPassConcept PassConceptT; - /// \brief List of function analysis pass IDs and associated concept pointers. + /// \brief List of analysis pass IDs and associated concept pointers. /// /// Requires iterators to be valid across appending new entries and arbitrary - /// erases. Provides the analysis ID to enable finding iterators to a given entry - /// in maps below, and provides the storage for the actual result concept. + /// erases. Provides the analysis ID to enable finding iterators to a given + /// entry in maps below, and provides the storage for the actual result + /// concept. 
typedef std::list>> AnalysisResultListT; @@ -504,8 +500,8 @@ template class AnalysisManager { typedef DenseMap AnalysisResultListMapT; /// \brief Map type from a pair of analysis ID and IRUnitT pointer to an - /// iterator into a particular result list which is where the actual result - /// is stored. + /// iterator into a particular result list (which is where the actual analysis + /// result is stored). typedef DenseMap, typename AnalysisResultListT::iterator> AnalysisResultMapT; @@ -515,28 +511,28 @@ template class AnalysisManager { /// /// When an analysis result embeds handles to other analysis results, it /// needs to be invalidated both when its own information isn't preserved and - /// if any of those embedded analysis results end up invalidated. We pass in - /// an \c Invalidator object from the analysis manager in order to let the - /// analysis results themselves define the dependency graph on the fly. This - /// avoids building an explicit data structure representation of the + /// when any of its embedded analysis results end up invalidated. We pass an + /// \c Invalidator object as an argument to \c invalidate() in order to let + /// the analysis results themselves define the dependency graph on the fly. + /// This lets us avoid building building an explicit representation of the /// dependencies between analysis results. class Invalidator { public: /// Trigger the invalidation of some other analysis pass if not already - /// handled and return whether it will in fact be invalidated. + /// handled and return whether it was in fact invalidated. /// /// This is expected to be called from within a given analysis result's \c /// invalidate method to trigger a depth-first walk of all inter-analysis /// dependencies. The same \p IR unit and \p PA passed to that result's \c /// invalidate method should in turn be provided to this routine. /// - /// The first time this is called for a given analysis pass, it will - /// trigger the corresponding result's \c invalidate method to be called. - /// Subsequent calls will use a cache of the results of that initial call. - /// It is an error to form cyclic dependencies between analysis results. + /// The first time this is called for a given analysis pass, it will call + /// the corresponding result's \c invalidate method. Subsequent calls will + /// use a cache of the results of that initial call. It is an error to form + /// cyclic dependencies between analysis results. /// - /// This returns true if the given analysis pass's result is invalid and - /// any dependecies on it will become invalid as a result. + /// This returns true if the given analysis's result is invalid. Any + /// dependecies on it will become invalid as a result. template bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA) { typedef detail::AnalysisResultModel class AnalysisManager { auto &Result = static_cast(*RI->second->second); - // Insert into the map whether the result should be invalidated and - // return that. Note that we cannot re-use IMapI and must do a fresh - // insert here as calling the invalidate routine could (recursively) - // insert things into the map making any iterator or reference invalid. + // Insert into the map whether the result should be invalidated and return + // that. Note that we cannot reuse IMapI and must do a fresh insert here, + // as calling invalidate could (recursively) insert things into the map, + // making any iterator or reference invalid. 
bool Inserted; std::tie(IMapI, Inserted) = IsResultInvalidated.insert({ID, Result.invalidate(IR, PA, *this)}); @@ -600,8 +596,7 @@ template class AnalysisManager { /// \brief Construct an empty analysis manager. /// - /// A flag can be passed to indicate that the manager should perform debug - /// logging. + /// If \p DebugLogging is true, we'll log our progress to llvm::dbgs(). AnalysisManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {} AnalysisManager(AnalysisManager &&) = default; AnalysisManager &operator=(AnalysisManager &&) = default; @@ -614,11 +609,11 @@ template class AnalysisManager { return AnalysisResults.empty(); } - /// \brief Clear any results for a single unit of IR. + /// \brief Clear any cached analysis results for a single unit of IR. /// - /// This doesn't invalidate but directly clears the results. It is useful - /// when the IR is being removed and we want to clear out all the memory - /// pinned for it. + /// This doesn't invalidate, but instead simply deletes, the relevant results. + /// It is useful when the IR is being removed and we want to clear out all the + /// memory pinned for it. void clear(IRUnitT &IR) { if (DebugLogging) dbgs() << "Clearing all analysis results for: " << IR.getName() << "\n"; @@ -626,7 +621,7 @@ template class AnalysisManager { auto ResultsListI = AnalysisResultLists.find(&IR); if (ResultsListI == AnalysisResultLists.end()) return; - // Clear the map pointing into the results list. + // Delete the map entries that point into the results list. for (auto &IDAndResult : ResultsListI->second) AnalysisResults.erase({IDAndResult.first, &IR}); @@ -634,21 +629,20 @@ template class AnalysisManager { AnalysisResultLists.erase(ResultsListI); } - /// \brief Clear the analysis result cache. + /// \brief Clear all analysis results cached by this AnalysisManager. /// - /// This routine allows cleaning up when the set of IR units itself has - /// potentially changed, and thus we can't even look up a a result and - /// invalidate it directly. Notably, this does *not* call invalidate - /// functions as there is nothing to be done for them. + /// Like \c clear(IRUnitT&), this doesn't invalidate the results; it simply + /// deletes them. This lets you clean up the AnalysisManager when the set of + /// IR units itself has potentially changed, and thus we can't even look up a + /// a result and invalidate/clear it directly. void clear() { AnalysisResults.clear(); AnalysisResultLists.clear(); } - /// \brief Get the result of an analysis pass for this module. + /// \brief Get the result of an analysis pass for a given IR unit. /// - /// If there is not a valid cached result in the manager already, this will - /// re-run the analysis to produce a valid result. + /// Runs the analysis if a cached result is not available. template typename PassT::Result &getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs) { assert(AnalysisPasses.count(PassT::ID()) && @@ -661,7 +655,7 @@ template class AnalysisManager { return static_cast(ResultConcept).Result; } - /// \brief Get the cached result of an analysis pass for this module. + /// \brief Get the cached result of an analysis pass for a given IR unit. /// /// This method never runs the analysis. /// @@ -683,22 +677,21 @@ template class AnalysisManager { /// \brief Register an analysis pass with the manager. /// - /// The argument is a callable whose result is a pass. This allows passing in - /// a lambda to construct the pass. + /// The parameter is a callable whose result is an analysis pass. 
This allows + /// passing in a lambda to construct the analysis. /// - /// The pass type registered is the result type of calling the argument. If - /// that pass has already been registered, then the argument will not be - /// called and this function will return false. Otherwise, the pass type - /// becomes registered, with the instance provided by calling the argument - /// once, and this function returns true. + /// The analysis type to register is the type returned by calling the \c + /// PassBuilder argument. If that type has already been registered, then the + /// argument will not be called and this function will return false. + /// Otherwise, we register the analysis returned by calling \c PassBuilder(), + /// and this function returns true. /// - /// While this returns whether or not the pass type was already registered, - /// there in't an independent way to query that as that would be prone to - /// risky use when *querying* the analysis manager. Instead, the only - /// supported use case is avoiding duplicate registry of an analysis. This - /// interface also lends itself to minimizing the number of times we have to - /// do lookups for analyses or construct complex passes only to throw them - /// away. + /// (Note: Although the return value of this function indicates whether or not + /// an analysis was previously registered, there intentionally isn't a way to + /// query this directly. Instead, you should just register all the analyses + /// you might want and let this class run them lazily. This idiom lets us + /// minimize the number of times we have to look up analyses in our + /// hashtable.) template bool registerPass(PassBuilderT &&PassBuilder) { typedef decltype(PassBuilder()) PassT; @@ -718,17 +711,18 @@ template class AnalysisManager { /// \brief Invalidate a specific analysis pass for an IR module. /// - /// Note that the analysis result can disregard invalidation. + /// Note that the analysis result can disregard invalidation, if it determines + /// it is in fact still valid. template void invalidate(IRUnitT &IR) { assert(AnalysisPasses.count(PassT::ID()) && "This analysis pass was not registered prior to being invalidated"); invalidateImpl(PassT::ID(), IR); } - /// \brief Invalidate analyses cached for an IR unit. + /// \brief Invalidate cached analyses for an IR unit. /// /// Walk through all of the analyses pertaining to this unit of IR and - /// invalidate them unless they are preserved by the PreservedAnalyses set. + /// invalidate them, unless they are preserved by the PreservedAnalyses set. void invalidate(IRUnitT &IR, const PreservedAnalyses &PA) { // We're done if all analyses on this IR unit are preserved. if (PA.allAnalysesInSetPreserved>()) @@ -738,8 +732,8 @@ template class AnalysisManager { dbgs() << "Invalidating all non-preserved analyses for: " << IR.getName() << "\n"; - // Track whether each pass's result is invalidated. Memoize the results - // using the IsResultInvalidated map. + // Track whether each analysis's result is invalidated in + // IsResultInvalidated. SmallDenseMap IsResultInvalidated; Invalidator Inv(IsResultInvalidated, AnalysisResults); AnalysisResultListT &ResultsList = AnalysisResultLists[&IR]; @@ -758,9 +752,9 @@ template class AnalysisManager { // Try to invalidate the result, giving it the Invalidator so it can // recursively query for any dependencies it has and record the result. 
- // Note that we cannot re-use 'IMapI' here or pre-insert the ID as the - // invalidate method may insert things into the map as well, invalidating - // any iterator or pointer. + // Note that we cannot reuse 'IMapI' here or pre-insert the ID, as + // Result.invalidate may insert things into the map, invalidating our + // iterator. bool Inserted = IsResultInvalidated.insert({ID, Result.invalidate(IR, PA, Inv)}) .second; @@ -873,7 +867,7 @@ template class AnalysisManager { /// analysis result. AnalysisResultMapT AnalysisResults; - /// \brief A flag indicating whether debug logging is enabled. + /// \brief Indicates whether we log to \c llvm::dbgs(). bool DebugLogging; }; diff --git a/include/llvm/LTO/LTO.h b/include/llvm/LTO/LTO.h index bc435702157..78ac73a7418 100644 --- a/include/llvm/LTO/LTO.h +++ b/include/llvm/LTO/LTO.h @@ -382,6 +382,10 @@ class LTO { /// The unmangled name of the global. std::string IRName; + /// Keep track if the symbol is visible outside of ThinLTO (i.e. in + /// either a regular object or the regular LTO partition). + bool VisibleOutsideThinLTO = false; + bool UnnamedAddr = true; /// This field keeps track of the partition number of this global. The @@ -405,6 +409,9 @@ class LTO { /// This global is either used by more than one partition or has an /// external reference, and therefore cannot be internalized. External = -2u, + + /// The RegularLTO partition + RegularLTO = 0, }; }; diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h index a300c4f6fb0..25642379ac9 100644 --- a/include/llvm/MC/MCTargetOptions.h +++ b/include/llvm/MC/MCTargetOptions.h @@ -11,6 +11,7 @@ #define LLVM_MC_MCTARGETOPTIONS_H #include +#include namespace llvm { @@ -51,11 +52,17 @@ class MCTargetOptions { bool PreserveAsmComments : 1; int DwarfVersion; + /// getABIName - If this returns a non-empty string this represents the /// textual name of the ABI that we want the backend to use, e.g. o32, or /// aapcs-linux. StringRef getABIName() const; std::string ABIName; + + /// Additional paths to search for `.include` directives when using the + /// integrated assembler. + std::vector IASSearchPaths; + MCTargetOptions(); }; @@ -75,7 +82,8 @@ inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) { ARE_EQUAL(ShowMCInst) && ARE_EQUAL(AsmVerbose) && ARE_EQUAL(DwarfVersion) && - ARE_EQUAL(ABIName)); + ARE_EQUAL(ABIName) && + ARE_EQUAL(IASSearchPaths)); #undef ARE_EQUAL } diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index 586999794d5..ad21d8af66e 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -27,7 +27,6 @@ #ifndef LLVM_SUPPORT_FILESYSTEM_H #define LLVM_SUPPORT_FILESYSTEM_H -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -37,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -829,28 +829,23 @@ class directory_iterator { }; namespace detail { - /// RecDirIterState - Keeps state for the recursive_directory_iterator. It is - /// reference counted in order to preserve InputIterator semantics on copy. - struct RecDirIterState : public RefCountedBase { - RecDirIterState() - : Level(0) - , HasNoPushRequest(false) {} - + /// Keeps state for the recursive_directory_iterator. 
+ struct RecDirIterState { std::stack> Stack; - uint16_t Level; - bool HasNoPushRequest; + uint16_t Level = 0; + bool HasNoPushRequest = false; }; } // end namespace detail /// recursive_directory_iterator - Same as directory_iterator except for it /// recurses down into child directories. class recursive_directory_iterator { - IntrusiveRefCntPtr State; + std::shared_ptr State; public: recursive_directory_iterator() = default; explicit recursive_directory_iterator(const Twine &path, std::error_code &ec) - : State(new detail::RecDirIterState) { + : State(std::make_shared()) { State->Stack.push(directory_iterator(path, ec)); if (State->Stack.top() == directory_iterator()) State.reset(); diff --git a/include/llvm/Support/TarWriter.h b/include/llvm/Support/TarWriter.h new file mode 100644 index 00000000000..44bdcaf2c46 --- /dev/null +++ b/include/llvm/Support/TarWriter.h @@ -0,0 +1,32 @@ +//===-- llvm/Support/TarWriter.h - Tar archive file creator -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_TAR_WRITER_H +#define LLVM_SUPPORT_TAR_WRITER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { +class TarWriter { +public: + static Expected> create(StringRef OutputPath, + StringRef BaseDir); + + void append(StringRef Path, StringRef Data); + +private: + TarWriter(int FD, StringRef BaseDir); + raw_fd_ostream OS; + std::string BaseDir; +}; +} + +#endif diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h index d7acbe883c5..eaea092c917 100644 --- a/include/llvm/Transforms/IPO/FunctionImport.h +++ b/include/llvm/Transforms/IPO/FunctionImport.h @@ -86,11 +86,15 @@ class FunctionImportPass : public PassInfoMixin { /// \p ExportLists contains for each Module the set of globals (GUID) that will /// be imported by another module, or referenced by such a function. I.e. this /// is the set of globals that need to be promoted/renamed appropriately. +/// +/// \p DeadSymbols (optional) contains a list of GUID that are deemed "dead" and +/// will be ignored for the purpose of importing. void ComputeCrossModuleImport( const ModuleSummaryIndex &Index, const StringMap &ModuleToDefinedGVSummaries, StringMap &ImportLists, - StringMap &ExportLists); + StringMap &ExportLists, + const DenseSet *DeadSymbols = nullptr); /// Compute all the imports for the given module using the Index. /// @@ -100,6 +104,13 @@ void ComputeCrossModuleImportForModule( StringRef ModulePath, const ModuleSummaryIndex &Index, FunctionImporter::ImportMapTy &ImportList); +/// Compute all the symbols that are "dead": i.e these that can't be reached +/// in the graph from any of the given symbols listed in +/// \p GUIDPreservedSymbols. +DenseSet +computeDeadSymbols(const ModuleSummaryIndex &Index, + const DenseSet &GUIDPreservedSymbols); + /// Compute the set of summaries needed for a ThinLTO backend compilation of /// \p ModulePath. 
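A minimal usage sketch for the TarWriter interface declared above; the archive name, base directory, and member contents are placeholder values, not anything taken from the patch:

  #include "llvm/Support/Error.h"
  #include "llvm/Support/TarWriter.h"
  #include <memory>

  // Create example.tar and append a single member, using only the two
  // entry points declared in TarWriter.h (create and append).
  static llvm::Error writeExampleTar() {
    llvm::Expected<std::unique_ptr<llvm::TarWriter>> TW =
        llvm::TarWriter::create("example.tar", "base");
    if (!TW)
      return TW.takeError();
    (*TW)->append("hello.txt", "hello, tar\n");
    return llvm::Error::success();
  }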
// diff --git a/include/llvm/Transforms/IPO/LowerTypeTests.h b/include/llvm/Transforms/IPO/LowerTypeTests.h index 23c59c199a3..ca6e1b878df 100644 --- a/include/llvm/Transforms/IPO/LowerTypeTests.h +++ b/include/llvm/Transforms/IPO/LowerTypeTests.h @@ -60,10 +60,6 @@ struct BitSetInfo { bool containsGlobalOffset(uint64_t Offset) const; - bool containsValue(const DataLayout &DL, - const DenseMap &GlobalLayout, - Value *V, uint64_t COffset = 0) const; - void print(raw_ostream &OS) const; }; diff --git a/include/llvm/Transforms/Utils/FunctionImportUtils.h b/include/llvm/Transforms/Utils/FunctionImportUtils.h index 57b7d0fcd7c..f18cd92310b 100644 --- a/include/llvm/Transforms/Utils/FunctionImportUtils.h +++ b/include/llvm/Transforms/Utils/FunctionImportUtils.h @@ -40,9 +40,20 @@ class FunctionImportGlobalProcessing { /// as part of a different backend compilation process. bool HasExportedFunctions = false; + /// Set of llvm.*used values, in order to validate that we don't try + /// to promote any non-renamable values. + SmallPtrSet Used; + /// Check if we should promote the given local value to global scope. bool shouldPromoteLocalToGlobal(const GlobalValue *SGV); +#ifndef NDEBUG + /// Check if the given value is a local that can't be renamed (promoted). + /// Only used in assertion checking, and disabled under NDEBUG since the Used + /// set will not be populated. + bool isNonRenamableLocal(const GlobalValue &GV) const; +#endif + /// Helper methods to check if we are importing from or potentially /// exporting from the current source module. bool isPerformingImport() const { return GlobalsToImport != nullptr; } @@ -82,6 +93,13 @@ class FunctionImportGlobalProcessing { // may be exported to another backend compilation. if (!GlobalsToImport) HasExportedFunctions = ImportIndex.hasExportedFunctions(M); + +#ifndef NDEBUG + // First collect those in the llvm.used set. + collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false); + // Next collect those in the llvm.compiler.used set. 
+ collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ true); +#endif } bool run(); diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap index a86bc7e7fcb..29e6d66b27f 100644 --- a/include/llvm/module.modulemap +++ b/include/llvm/module.modulemap @@ -151,6 +151,7 @@ module LLVM_intrinsic_gen { module IR_NoFolder { header "IR/NoFolder.h" export * } module IR_Module { header "IR/Module.h" export * } module IR_ModuleSummaryIndex { header "IR/ModuleSummaryIndex.h" export * } + module IR_ModuleSummaryIndexYAML { header "IR/ModuleSummaryIndexYAML.h" export * } module IR_Function { header "IR/Function.h" export * } module IR_InstrTypes { header "IR/InstrTypes.h" export * } module IR_Instructions { header "IR/Instructions.h" export * } diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 1d2ffc1abe1..6387bb36166 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -80,10 +80,15 @@ static CalleeInfo::HotnessType getHotness(uint64_t ProfileCount, return CalleeInfo::HotnessType::None; } -static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, - const Function &F, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, - bool HasLocalsInUsed) { +static bool isNonRenamableLocal(const GlobalValue &GV) { + return GV.hasSection() && GV.hasLocalLinkage(); +} + +static void +computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, + const Function &F, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, bool HasLocalsInUsed, + DenseSet &CantBePromoted) { // Summary not currently supported for anonymous functions, they should // have been named. assert(F.hasName()); @@ -178,37 +183,64 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, } } - GlobalValueSummary::GVFlags Flags(F); + bool NonRenamableLocal = isNonRenamableLocal(F); + bool NotEligibleForImport = + NonRenamableLocal || HasInlineAsmMaybeReferencingInternal || + // Inliner doesn't handle variadic functions. + // FIXME: refactor this to use the same code that inliner is using. 
+ F.isVarArg(); + GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport, + /* LiveRoot = */ false); auto FuncSummary = llvm::make_unique( Flags, NumInsts, RefEdges.takeVector(), CallGraphEdges.takeVector(), TypeTests.takeVector()); - if (HasInlineAsmMaybeReferencingInternal) - FuncSummary->setHasInlineAsmMaybeReferencingInternal(); + if (NonRenamableLocal) + CantBePromoted.insert(F.getGUID()); Index.addGlobalValueSummary(F.getName(), std::move(FuncSummary)); } -static void computeVariableSummary(ModuleSummaryIndex &Index, - const GlobalVariable &V) { +static void +computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, + DenseSet &CantBePromoted) { SetVector RefEdges; SmallPtrSet Visited; findRefEdges(&V, RefEdges, Visited); - GlobalValueSummary::GVFlags Flags(V); + bool NonRenamableLocal = isNonRenamableLocal(V); + GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, + /* LiveRoot = */ false); auto GVarSummary = llvm::make_unique(Flags, RefEdges.takeVector()); + if (NonRenamableLocal) + CantBePromoted.insert(V.getGUID()); Index.addGlobalValueSummary(V.getName(), std::move(GVarSummary)); } -static void computeAliasSummary(ModuleSummaryIndex &Index, - const GlobalAlias &A) { - GlobalValueSummary::GVFlags Flags(A); +static void +computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A, + DenseSet &CantBePromoted) { + bool NonRenamableLocal = isNonRenamableLocal(A); + GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal, + /* LiveRoot = */ false); auto AS = llvm::make_unique(Flags, ArrayRef{}); auto *Aliasee = A.getBaseObject(); auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee); assert(AliaseeSummary && "Alias expects aliasee summary to be parsed"); AS->setAliasee(AliaseeSummary); + if (NonRenamableLocal) + CantBePromoted.insert(A.getGUID()); Index.addGlobalValueSummary(A.getName(), std::move(AS)); } +// Set LiveRoot flag on entries matching the given value name. +static void setLiveRoot(ModuleSummaryIndex &Index, StringRef Name) { + auto SummaryList = + Index.findGlobalValueSummaryList(GlobalValue::getGUID(Name)); + if (SummaryList == Index.end()) + return; + for (auto &Summary : SummaryList->second) + Summary->setLiveRoot(); +} + ModuleSummaryIndex llvm::buildModuleSummaryIndex( const Module &M, std::function GetBFICallback, @@ -226,9 +258,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false); // Next collect those in the llvm.compiler.used set. collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ true); + DenseSet CantBePromoted; for (auto *V : Used) { - if (V->hasLocalLinkage()) + if (V->hasLocalLinkage()) { LocalsUsed.insert(V); + CantBePromoted.insert(V->getGUID()); + } } // Compute summaries for all functions defined in module, and save in the @@ -248,7 +283,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( BFI = BFIPtr.get(); } - computeFunctionSummary(Index, M, F, BFI, PSI, !LocalsUsed.empty()); + computeFunctionSummary(Index, M, F, BFI, PSI, !LocalsUsed.empty(), + CantBePromoted); } // Compute summaries for all variables defined in module, and save in the @@ -256,20 +292,29 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( for (const GlobalVariable &G : M.globals()) { if (G.isDeclaration()) continue; - computeVariableSummary(Index, G); + computeVariableSummary(Index, G, CantBePromoted); } // Compute summaries for all aliases defined in module, and save in the // index. 
for (const GlobalAlias &A : M.aliases()) - computeAliasSummary(Index, A); + computeAliasSummary(Index, A, CantBePromoted); for (auto *V : LocalsUsed) { auto *Summary = Index.getGlobalValueSummary(*V); assert(Summary && "Missing summary for global value"); - Summary->setNoRename(); + Summary->setNotEligibleToImport(); } + // The linker doesn't know about these LLVM produced values, so we need + // to flag them as live in the index to ensure index-based dead value + // analysis treats them as live roots of the analysis. + setLiveRoot(Index, "llvm.used"); + setLiveRoot(Index, "llvm.compiler.used"); + setLiveRoot(Index, "llvm.global_ctors"); + setLiveRoot(Index, "llvm.global_dtors"); + setLiveRoot(Index, "llvm.global.annotations"); + if (!M.getModuleInlineAsm().empty()) { // Collect the local values defined by module level asm, and set up // summaries for these symbols so that they can be marked as NoRename, @@ -282,7 +327,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( // referenced from there. ModuleSymbolTable::CollectAsmSymbols( Triple(M.getTargetTriple()), M.getModuleInlineAsm(), - [&M, &Index](StringRef Name, object::BasicSymbolRef::Flags Flags) { + [&M, &Index, &CantBePromoted](StringRef Name, + object::BasicSymbolRef::Flags Flags) { // Symbols not marked as Weak or Global are local definitions. if (Flags & (object::BasicSymbolRef::SF_Weak | object::BasicSymbolRef::SF_Global)) @@ -291,11 +337,10 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( if (!GV) return; assert(GV->isDeclaration() && "Def in module asm already has definition"); - GlobalValueSummary::GVFlags GVFlags( - GlobalValue::InternalLinkage, - /* NoRename */ true, - /* HasInlineAsmMaybeReferencingInternal */ false, - /* IsNotViableToInline */ true); + GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage, + /* NotEligibleToImport */ true, + /* LiveRoot */ true); + CantBePromoted.insert(GlobalValue::getGUID(Name)); // Create the appropriate summary type. if (isa(GV)) { std::unique_ptr Summary = @@ -303,18 +348,41 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( GVFlags, 0, ArrayRef{}, ArrayRef{}, ArrayRef{}); - Summary->setNoRename(); Index.addGlobalValueSummary(Name, std::move(Summary)); } else { std::unique_ptr Summary = llvm::make_unique(GVFlags, ArrayRef{}); - Summary->setNoRename(); Index.addGlobalValueSummary(Name, std::move(Summary)); } }); } + for (auto &GlobalList : Index) { + assert(GlobalList.second.size() == 1 && + "Expected module's index to have one summary per GUID"); + auto &Summary = GlobalList.second[0]; + bool AllRefsCanBeExternallyReferenced = + llvm::all_of(Summary->refs(), [&](const ValueInfo &VI) { + return !CantBePromoted.count(VI.getValue()->getGUID()); + }); + if (!AllRefsCanBeExternallyReferenced) { + Summary->setNotEligibleToImport(); + continue; + } + + if (auto *FuncSummary = dyn_cast(Summary.get())) { + bool AllCallsCanBeExternallyReferenced = llvm::all_of( + FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) { + auto GUID = Edge.first.isGUID() ? 
Edge.first.getGUID() + : Edge.first.getValue()->getGUID(); + return !CantBePromoted.count(GUID); + }); + if (!AllCallsCanBeExternallyReferenced) + Summary->setNotEligibleToImport(); + } + } + return Index; } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 2a15b9b264e..cd8c24630df 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -389,8 +389,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { } int TargetTransformInfo::getAddressComputationCost(Type *Tp, - bool IsComplex) const { - int Cost = TTIImpl->getAddressComputationCost(Tp, IsComplex); + ScalarEvolution *SE, + const SCEV *Ptr) const { + int Cost = TTIImpl->getAddressComputationCost(Tp, SE, Ptr); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 03aefcf5711..d9e249aad21 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -801,12 +801,12 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags, // to getDecodedLinkage() will need to be taken into account here as above. auto Linkage = GlobalValue::LinkageTypes(RawFlags & 0xF); // 4 bits RawFlags = RawFlags >> 4; - bool NoRename = RawFlags & 0x1; - bool IsNotViableToInline = RawFlags & 0x2; - bool HasInlineAsmMaybeReferencingInternal = RawFlags & 0x4; - return GlobalValueSummary::GVFlags(Linkage, NoRename, - HasInlineAsmMaybeReferencingInternal, - IsNotViableToInline); + bool NotEligibleToImport = (RawFlags & 0x1) || Version < 3; + // The LiveRoot flag wasn't introduced until version 3. For dead stripping + // to work correctly on earlier versions, we must conservatively treat all + // values as live. + bool LiveRoot = (RawFlags & 0x2) || Version < 3; + return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, LiveRoot); } static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) { @@ -4838,9 +4838,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary( } const uint64_t Version = Record[0]; const bool IsOldProfileFormat = Version == 1; - if (!IsOldProfileFormat && Version != 2) + if (Version < 1 || Version > 3) return error("Invalid summary version " + Twine(Version) + - ", 1 or 2 expected"); + ", 1, 2 or 3 expected"); Record.clear(); // Keep around the last seen summary to be used when we see an optional diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp index 43c9aebd79e..771cf3d927b 100644 --- a/lib/Bitcode/Reader/BitstreamReader.cpp +++ b/lib/Bitcode/Reader/BitstreamReader.cpp @@ -93,20 +93,29 @@ static void skipAbbreviatedField(BitstreamCursor &Cursor, } /// skipRecord - Read the current record and discard it. -void BitstreamCursor::skipRecord(unsigned AbbrevID) { +unsigned BitstreamCursor::skipRecord(unsigned AbbrevID) { // Skip unabbreviated records by reading past their entries. 
if (AbbrevID == bitc::UNABBREV_RECORD) { unsigned Code = ReadVBR(6); - (void)Code; unsigned NumElts = ReadVBR(6); for (unsigned i = 0; i != NumElts; ++i) (void)ReadVBR64(6); - return; + return Code; } const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID); + const BitCodeAbbrevOp &CodeOp = Abbv->getOperandInfo(0); + unsigned Code; + if (CodeOp.isLiteral()) + Code = CodeOp.getLiteralValue(); + else { + if (CodeOp.getEncoding() == BitCodeAbbrevOp::Array || + CodeOp.getEncoding() == BitCodeAbbrevOp::Blob) + report_fatal_error("Abbreviation starts with an Array or a Blob"); + Code = readAbbreviatedField(*this, CodeOp); + } - for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) { + for (unsigned i = 1, e = Abbv->getNumOperandInfos(); i < e; ++i) { const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i); if (Op.isLiteral()) continue; @@ -164,6 +173,7 @@ void BitstreamCursor::skipRecord(unsigned AbbrevID) { // Skip over the blob. JumpToBit(NewEnd); } + return Code; } unsigned BitstreamCursor::readRecord(unsigned AbbrevID, @@ -273,7 +283,7 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID, } void BitstreamCursor::ReadAbbrevRecord() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); unsigned NumOpInfo = ReadVBR(5); for (unsigned i = 0; i != NumOpInfo; ++i) { bool IsLiteral = Read(1); @@ -307,7 +317,7 @@ void BitstreamCursor::ReadAbbrevRecord() { if (Abbv->getNumOperandInfos() == 0) report_fatal_error("Abbrev record with no operands"); - CurAbbrevs.push_back(Abbv); + CurAbbrevs.push_back(std::move(Abbv)); } Optional diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index 5da421a79b7..460d39cc28d 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -14,10 +14,12 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" @@ -86,12 +88,23 @@ using namespace llvm; +#define DEBUG_TYPE "bitcode-reader" + +STATISTIC(NumMDStringLoaded, "Number of MDStrings loaded"); +STATISTIC(NumMDNodeTemporary, "Number of MDNode::Temporary created"); +STATISTIC(NumMDRecordLoaded, "Number of Metadata records loaded"); + /// Flag whether we need to import full type definitions for ThinLTO. /// Currently needed for Darwin and LLDB. static cl::opt ImportFullTypeDefinitions( "import-full-type-definitions", cl::init(false), cl::Hidden, cl::desc("Import full type definitions for ThinLTO.")); +static cl::opt DisableLazyLoading( + "disable-ondemand-mds-loading", cl::init(false), cl::Hidden, + cl::desc("Force disable the lazy-loading on-demand of metadata when " + "loading bitcode for importing.")); + namespace { static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; } @@ -165,6 +178,10 @@ class BitcodeReaderMetadataList { void assignValue(Metadata *MD, unsigned Idx); void tryToResolveCycles(); bool hasFwdRefs() const { return !ForwardReference.empty(); } + int getNextFwdRef() { + assert(hasFwdRefs()); + return *ForwardReference.begin(); + } /// Upgrade a type that had an MDString reference. 
void addTypeRef(MDString &UUID, DICompositeType &CT); @@ -215,6 +232,7 @@ Metadata *BitcodeReaderMetadataList::getMetadataFwdRef(unsigned Idx) { ForwardReference.insert(Idx); // Create and return a placeholder, which will later be RAUW'd. + ++NumMDNodeTemporary; Metadata *MD = MDNode::getTemporary(Context, None).release(); MetadataPtrs[Idx].reset(MD); return MD; @@ -340,8 +358,26 @@ class PlaceholderQueue { std::deque PHs; public: + bool empty() { return PHs.empty(); } DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID); void flush(BitcodeReaderMetadataList &MetadataList); + + /// Return the list of temporaries nodes in the queue, these need to be + /// loaded before we can flush the queue. + void getTemporaries(BitcodeReaderMetadataList &MetadataList, + DenseSet &Temporaries) { + for (auto &PH : PHs) { + auto ID = PH.getID(); + auto *MD = MetadataList.lookup(ID); + if (!MD) { + Temporaries.insert(ID); + continue; + } + auto *N = dyn_cast_or_null(MD); + if (N && N->isTemporary()) + Temporaries.insert(ID); + } + } }; } // end anonymous namespace @@ -375,6 +411,30 @@ class MetadataLoader::MetadataLoaderImpl { Module &TheModule; std::function getTypeByID; + /// Cursor associated with the lazy-loading of Metadata. This is the easy way + /// to keep around the right "context" (Abbrev list) to be able to jump in + /// the middle of the metadata block and load any record. + BitstreamCursor IndexCursor; + + /// Index that keeps track of MDString values. + std::vector MDStringRef; + + /// On-demand loading of a single MDString. Requires the index above to be + /// populated. + MDString *lazyLoadOneMDString(unsigned Idx); + + /// Index that keeps track of where to find a metadata record in the stream. + std::vector GlobalMetadataBitPosIndex; + + /// Populate the index above to enable lazily loading of metadata, and load + /// the named metadata as well as the transitively referenced global + /// Metadata. + Expected lazyLoadModuleMetadataBlock(PlaceholderQueue &Placeholders); + + /// On-demand loading of a single metadata. Requires the index above to be + /// populated. + void lazyLoadOneMetadata(unsigned Idx, PlaceholderQueue &Placeholders); + // Keep mapping of seens pair of old-style CU <-> SP, and update pointers to // point from SP to CU after a block is completly parsed. std::vector> CUSubprograms; @@ -394,13 +454,25 @@ class MetadataLoader::MetadataLoaderImpl { Error parseOneMetadata(SmallVectorImpl &Record, unsigned Code, PlaceholderQueue &Placeholders, StringRef Blob, - bool ModuleLevel, unsigned &NextMetadataNo); + unsigned &NextMetadataNo); Error parseMetadataStrings(ArrayRef Record, StringRef Blob, - unsigned &NextMetadataNo); + std::function CallBack); Error parseGlobalObjectAttachment(GlobalObject &GO, ArrayRef Record); Error parseMetadataKindRecord(SmallVectorImpl &Record); + void resolveForwardRefsAndPlaceholders(PlaceholderQueue &Placeholders); + + /// Upgrade old-style CU <-> SP pointers to point from SP to CU. 
+ void upgradeCUSubprograms() { + for (auto CU_SP : CUSubprograms) + if (auto *SPs = dyn_cast_or_null(CU_SP.second)) + for (auto &Op : SPs->operands()) + if (auto *SP = dyn_cast_or_null(Op)) + SP->replaceOperandWith(7, CU_SP.first); + CUSubprograms.clear(); + } + public: MetadataLoaderImpl(BitstreamCursor &Stream, Module &TheModule, BitcodeReaderValueList &ValueList, @@ -444,20 +516,217 @@ Error error(const Twine &Message) { Message, make_error_code(BitcodeError::CorruptedBitcode)); } +Expected MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock( + PlaceholderQueue &Placeholders) { + IndexCursor = Stream; + SmallVector Record; + // Get the abbrevs, and preload record positions to make them lazy-loadable. + while (true) { + BitstreamEntry Entry = IndexCursor.advanceSkippingSubblocks( + BitstreamCursor::AF_DontPopBlockAtEnd); + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return error("Malformed block"); + case BitstreamEntry::EndBlock: { + return true; + } + case BitstreamEntry::Record: { + // The interesting case. + ++NumMDRecordLoaded; + uint64_t CurrentPos = IndexCursor.GetCurrentBitNo(); + auto Code = IndexCursor.skipRecord(Entry.ID); + switch (Code) { + case bitc::METADATA_STRINGS: { + // Rewind and parse the strings. + IndexCursor.JumpToBit(CurrentPos); + StringRef Blob; + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record, &Blob); + unsigned NumStrings = Record[0]; + MDStringRef.reserve(NumStrings); + auto IndexNextMDString = [&](StringRef Str) { + MDStringRef.push_back(Str); + }; + if (auto Err = parseMetadataStrings(Record, Blob, IndexNextMDString)) + return std::move(Err); + break; + } + case bitc::METADATA_INDEX_OFFSET: { + // This is the offset to the index, when we see this we skip all the + // records and load only an index to these. + IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record); + if (Record.size() != 2) + return error("Invalid record"); + auto Offset = Record[0] + (Record[1] << 32); + auto BeginPos = IndexCursor.GetCurrentBitNo(); + IndexCursor.JumpToBit(BeginPos + Offset); + Entry = IndexCursor.advanceSkippingSubblocks( + BitstreamCursor::AF_DontPopBlockAtEnd); + assert(Entry.Kind == BitstreamEntry::Record && + "Corrupted bitcode: Expected `Record` when trying to find the " + "Metadata index"); + Record.clear(); + auto Code = IndexCursor.readRecord(Entry.ID, Record); + (void)Code; + assert(Code == bitc::METADATA_INDEX && "Corrupted bitcode: Expected " + "`METADATA_INDEX` when trying " + "to find the Metadata index"); + + // Delta unpack + auto CurrentValue = BeginPos; + GlobalMetadataBitPosIndex.reserve(Record.size()); + for (auto &Elt : Record) { + CurrentValue += Elt; + GlobalMetadataBitPosIndex.push_back(CurrentValue); + } + break; + } + case bitc::METADATA_INDEX: + // We don't expect to get there, the Index is loaded when we encounter + // the offset. + return error("Corrupted Metadata block"); + case bitc::METADATA_NAME: { + // Named metadata need to be materialized now and aren't deferred. + IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + unsigned Code = IndexCursor.readRecord(Entry.ID, Record); + assert(Code == bitc::METADATA_NAME); + + // Read name of the named metadata. 
+ SmallString<8> Name(Record.begin(), Record.end()); + Code = IndexCursor.ReadCode(); + + // Named Metadata comes in two parts, we expect the name to be followed + // by the node + Record.clear(); + unsigned NextBitCode = IndexCursor.readRecord(Code, Record); + assert(NextBitCode == bitc::METADATA_NAMED_NODE); + (void)NextBitCode; + + // Read named metadata elements. + unsigned Size = Record.size(); + NamedMDNode *NMD = TheModule.getOrInsertNamedMetadata(Name); + for (unsigned i = 0; i != Size; ++i) { + // FIXME: We could use a placeholder here, however NamedMDNode are + // taking MDNode as operand and not using the Metadata infrastructure. + // It is acknowledged by 'TODO: Inherit from Metadata' in the + // NamedMDNode class definition. + MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]); + assert(MD && "Invalid record"); + NMD->addOperand(MD); + } + break; + } + case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: { + // FIXME: we need to do this early because we don't materialize global + // value explicitly. + IndexCursor.JumpToBit(CurrentPos); + Record.clear(); + IndexCursor.readRecord(Entry.ID, Record); + if (Record.size() % 2 == 0) + return error("Invalid record"); + unsigned ValueID = Record[0]; + if (ValueID >= ValueList.size()) + return error("Invalid record"); + if (auto *GO = dyn_cast(ValueList[ValueID])) + if (Error Err = parseGlobalObjectAttachment( + *GO, ArrayRef(Record).slice(1))) + return std::move(Err); + break; + } + case bitc::METADATA_KIND: + case bitc::METADATA_STRING_OLD: + case bitc::METADATA_OLD_FN_NODE: + case bitc::METADATA_OLD_NODE: + case bitc::METADATA_VALUE: + case bitc::METADATA_DISTINCT_NODE: + case bitc::METADATA_NODE: + case bitc::METADATA_LOCATION: + case bitc::METADATA_GENERIC_DEBUG: + case bitc::METADATA_SUBRANGE: + case bitc::METADATA_ENUMERATOR: + case bitc::METADATA_BASIC_TYPE: + case bitc::METADATA_DERIVED_TYPE: + case bitc::METADATA_COMPOSITE_TYPE: + case bitc::METADATA_SUBROUTINE_TYPE: + case bitc::METADATA_MODULE: + case bitc::METADATA_FILE: + case bitc::METADATA_COMPILE_UNIT: + case bitc::METADATA_SUBPROGRAM: + case bitc::METADATA_LEXICAL_BLOCK: + case bitc::METADATA_LEXICAL_BLOCK_FILE: + case bitc::METADATA_NAMESPACE: + case bitc::METADATA_MACRO: + case bitc::METADATA_MACRO_FILE: + case bitc::METADATA_TEMPLATE_TYPE: + case bitc::METADATA_TEMPLATE_VALUE: + case bitc::METADATA_GLOBAL_VAR: + case bitc::METADATA_LOCAL_VAR: + case bitc::METADATA_EXPRESSION: + case bitc::METADATA_OBJC_PROPERTY: + case bitc::METADATA_IMPORTED_ENTITY: + case bitc::METADATA_GLOBAL_VAR_EXPR: + // We don't expect to see any of these, if we see one, give up on + // lazy-loading and fallback. + MDStringRef.clear(); + GlobalMetadataBitPosIndex.clear(); + return false; + } + break; + } + } + } +} + /// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing /// module level metadata. Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { if (!ModuleLevel && MetadataList.hasFwdRefs()) return error("Invalid metadata: fwd refs into function blocks"); + // Record the entry position so that we can jump back here and efficiently + // skip the whole block in case we lazy-load. 
+ auto EntryPos = Stream.GetCurrentBitNo(); + if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID)) return error("Invalid record"); - unsigned NextMetadataNo = MetadataList.size(); SmallVector Record; - PlaceholderQueue Placeholders; + // We lazy-load module-level metadata: we build an index for each record, and + // then load individual record as needed, starting with the named metadata. + if (ModuleLevel && IsImporting && MetadataList.empty() && + !DisableLazyLoading) { + auto SuccessOrErr = lazyLoadModuleMetadataBlock(Placeholders); + if (!SuccessOrErr) + return SuccessOrErr.takeError(); + if (SuccessOrErr.get()) { + // An index was successfully created and we will be able to load metadata + // on-demand. + MetadataList.resize(MDStringRef.size() + + GlobalMetadataBitPosIndex.size()); + + // Reading the named metadata created forward references and/or + // placeholders, that we flush here. + resolveForwardRefsAndPlaceholders(Placeholders); + upgradeCUSubprograms(); + // Return at the beginning of the block, since it is easy to skip it + // entirely from there. + Stream.ReadBlockEnd(); // Pop the abbrev block context. + Stream.JumpToBit(EntryPos); + if (Stream.SkipBlock()) + return error("Invalid record"); + return Error::success(); + } + // Couldn't load an index, fallback to loading all the block "old-style". + } + + unsigned NextMetadataNo = MetadataList.size(); + // Read all the records. while (true) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); @@ -467,16 +736,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: - // Upgrade old-style CU <-> SP pointers to point from SP to CU. - for (auto CU_SP : CUSubprograms) - if (auto *SPs = dyn_cast_or_null(CU_SP.second)) - for (auto &Op : SPs->operands()) - if (auto *SP = dyn_cast_or_null(Op)) - SP->replaceOperandWith(7, CU_SP.first); - CUSubprograms.clear(); - - MetadataList.tryToResolveCycles(); - Placeholders.flush(MetadataList); + resolveForwardRefsAndPlaceholders(Placeholders); + upgradeCUSubprograms(); return Error::success(); case BitstreamEntry::Record: // The interesting case. @@ -486,20 +747,86 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) { // Read a record. Record.clear(); StringRef Blob; + ++NumMDRecordLoaded; unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob); - if (Error Err = parseOneMetadata(Record, Code, Placeholders, Blob, - ModuleLevel, NextMetadataNo)) + if (Error Err = + parseOneMetadata(Record, Code, Placeholders, Blob, NextMetadataNo)) return Err; } } +MDString *MetadataLoader::MetadataLoaderImpl::lazyLoadOneMDString(unsigned ID) { + ++NumMDStringLoaded; + if (Metadata *MD = MetadataList.lookup(ID)) + return cast(MD); + auto MDS = MDString::get(Context, MDStringRef[ID]); + MetadataList.assignValue(MDS, ID); + return MDS; +} + +void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata( + unsigned ID, PlaceholderQueue &Placeholders) { + assert(ID < (MDStringRef.size()) + GlobalMetadataBitPosIndex.size()); + assert(ID >= MDStringRef.size() && "Unexpected lazy-loading of MDString"); +#ifndef NDEBUG + // Lookup first if the metadata hasn't already been loaded. 
+ if (auto *MD = MetadataList.lookup(ID)) { + auto *N = dyn_cast_or_null(MD); + assert(N && N->isTemporary() && "Lazy loading an already loaded metadata"); + } +#endif + SmallVector Record; + StringRef Blob; + IndexCursor.JumpToBit(GlobalMetadataBitPosIndex[ID - MDStringRef.size()]); + auto Entry = IndexCursor.advanceSkippingSubblocks(); + ++NumMDRecordLoaded; + unsigned Code = IndexCursor.readRecord(Entry.ID, Record, &Blob); + if (Error Err = parseOneMetadata(Record, Code, Placeholders, Blob, ID)) + report_fatal_error("Can't lazyload MD"); +} + +/// Ensure that all forward-references and placeholders are resolved. +/// Iteratively lazy-loading metadata on-demand if needed. +void MetadataLoader::MetadataLoaderImpl::resolveForwardRefsAndPlaceholders( + PlaceholderQueue &Placeholders) { + DenseSet Temporaries; + while (1) { + // Populate Temporaries with the placeholders that haven't been loaded yet. + Placeholders.getTemporaries(MetadataList, Temporaries); + + // If we don't have any temporary, or FwdReference, we're done! + if (Temporaries.empty() && !MetadataList.hasFwdRefs()) + break; + + // First, load all the temporaries. This can add new placeholders or + // forward references. + for (auto ID : Temporaries) + lazyLoadOneMetadata(ID, Placeholders); + Temporaries.clear(); + + // Second, load the forward-references. This can also add new placeholders + // or forward references. + while (MetadataList.hasFwdRefs()) + lazyLoadOneMetadata(MetadataList.getNextFwdRef(), Placeholders); + } + // At this point we don't have any forward reference remaining, or temporary + // that haven't been loaded. We can safely drop RAUW support and mark cycles + // as resolved. + MetadataList.tryToResolveCycles(); + + // Finally, everything is in place, we can replace the placeholders operands + // with the final node they refer to. + Placeholders.flush(MetadataList); +} + Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( SmallVectorImpl &Record, unsigned Code, - PlaceholderQueue &Placeholders, StringRef Blob, bool ModuleLevel, - unsigned &NextMetadataNo) { + PlaceholderQueue &Placeholders, StringRef Blob, unsigned &NextMetadataNo) { bool IsDistinct = false; auto getMD = [&](unsigned ID) -> Metadata * { + if (ID < MDStringRef.size()) + return lazyLoadOneMDString(ID); if (!IsDistinct) return MetadataList.getMetadataFwdRef(ID); if (auto *MD = MetadataList.getMetadataIfResolved(ID)) @@ -519,7 +846,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( auto getMDString = [&](unsigned ID) -> MDString * { // This requires that the ID is not really a forward reference. In // particular, the MDString must already have been resolved. - return cast_or_null(getMDOrNull(ID)); + auto MDS = getMDOrNull(ID); + return cast_or_null(MDS); }; // Support for old type refs. @@ -539,6 +867,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( Record.clear(); Code = Stream.ReadCode(); + ++NumMDRecordLoaded; unsigned NextBitCode = Stream.readRecord(Code, Record); if (NextBitCode != bitc::METADATA_NAMED_NODE) return error("METADATA_NAME not followed by METADATA_NAMED_NODE"); @@ -1137,15 +1466,20 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( // Test for upgrading !llvm.loop. 
HasSeenOldLoopTags |= mayBeOldLoopAttachmentTag(String); - + ++NumMDStringLoaded; Metadata *MD = MDString::get(Context, String); MetadataList.assignValue(MD, NextMetadataNo++); break; } - case bitc::METADATA_STRINGS: - if (Error Err = parseMetadataStrings(Record, Blob, NextMetadataNo)) + case bitc::METADATA_STRINGS: { + auto CreateNextMDString = [&](StringRef Str) { + ++NumMDStringLoaded; + MetadataList.assignValue(MDString::get(Context, Str), NextMetadataNo++); + }; + if (Error Err = parseMetadataStrings(Record, Blob, CreateNextMDString)) return Err; break; + } case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: { if (Record.size() % 2 == 0) return error("Invalid record"); @@ -1166,12 +1500,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } } -#undef GET_OR_DISTINCT return Error::success(); +#undef GET_OR_DISTINCT } Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings( - ArrayRef Record, StringRef Blob, unsigned &NextMetadataNo) { + ArrayRef Record, StringRef Blob, + std::function CallBack) { // All the MDStrings in the block are emitted together in a single // record. The strings are concatenated and stored in a blob along with // their sizes. @@ -1197,8 +1532,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings( if (Strings.size() < Size) return error("Invalid record: metadata strings truncated chars"); - MetadataList.assignValue(MDString::get(Context, Strings.slice(0, Size)), - NextMetadataNo++); + CallBack(Strings.slice(0, Size)); Strings = Strings.drop_front(Size); } while (--NumStrings); @@ -1228,6 +1562,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( SmallVector Record; + PlaceholderQueue Placeholders; + while (true) { BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); @@ -1236,6 +1572,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: + resolveForwardRefsAndPlaceholders(Placeholders); return Error::success(); case BitstreamEntry::Record: // The interesting case. @@ -1244,6 +1581,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( // Read a metadata attachment record. Record.clear(); + ++NumMDRecordLoaded; switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: ignore. break; @@ -1268,7 +1606,14 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment( if (I->second == LLVMContext::MD_tbaa && StripTBAA) continue; - Metadata *Node = MetadataList.getMetadataFwdRef(Record[i + 1]); + auto Idx = Record[i + 1]; + if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) && + !MetadataList.lookup(Idx)) + // Load the attachment if it is in the lazy-loadable range and hasn't + // been loaded yet. + lazyLoadOneMetadata(Idx, Placeholders); + + Metadata *Node = MetadataList.getMetadataFwdRef(Idx); if (isa(Node)) // Drop the attachment. This used to be legal, but there's no // upgrade path. @@ -1331,6 +1676,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataKinds() { // Read a record. Record.clear(); + ++NumMDRecordLoaded; unsigned Code = Stream.readRecord(Entry.ID, Record); switch (Code) { default: // Default behavior: ignore. 
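The METADATA_INDEX record consumed by lazyLoadModuleMetadataBlock above stores each record's bit position as a delta from the previous one, with the first delta relative to the position just after the METADATA_INDEX_OFFSET record. A small standalone sketch of that unpacking follows; the function name is illustrative and not part of the loader:

  #include "llvm/ADT/ArrayRef.h"
  #include <cstdint>
  #include <vector>

  // Rebuild absolute bit positions from the delta-encoded METADATA_INDEX
  // payload, mirroring the cumulative sum in lazyLoadModuleMetadataBlock.
  static std::vector<uint64_t> unpackIndex(uint64_t BeginPos,
                                           llvm::ArrayRef<uint64_t> Deltas) {
    std::vector<uint64_t> BitPos;
    BitPos.reserve(Deltas.size());
    uint64_t Current = BeginPos;
    for (uint64_t Delta : Deltas) {
      Current += Delta;
      BitPos.push_back(Current);
    }
    return BitPos;
  }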
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index c10ba2399e7..ebb2022551f 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -784,53 +784,53 @@ void ModuleBitcodeWriter::writeTypeTable() { uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies(); // Abbrev for TYPE_CODE_POINTER. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0 - unsigned PtrAbbrev = Stream.EmitAbbrev(Abbv); + unsigned PtrAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_FUNCTION. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned FunctionAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FunctionAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_ANON. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_ANON)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned StructAnonAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructAnonAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_NAME. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - unsigned StructNameAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructNameAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_STRUCT_NAMED. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAMED)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned StructNamedAbbrev = Stream.EmitAbbrev(Abbv); + unsigned StructNamedAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for TYPE_CODE_ARRAY. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits)); - unsigned ArrayAbbrev = Stream.EmitAbbrev(Abbv); + unsigned ArrayAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit an entry count so the reader can reserve space. TypeVals.push_back(TypeList.size()); @@ -971,9 +971,8 @@ static unsigned getEncodedLinkage(const GlobalValue &GV) { static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) { uint64_t RawFlags = 0; - RawFlags |= Flags.NoRename; // bool - RawFlags |= (Flags.IsNotViableToInline << 1); - RawFlags |= (Flags.HasInlineAsmMaybeReferencingInternal << 2); + RawFlags |= Flags.NotEligibleToImport; // bool + RawFlags |= (Flags.LiveRoot << 1); // Linkage don't need to be remapped at that time for the summary. Any future // change to the getEncodedLinkage() function will need to be taken into // account here as well. 
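Taken together with getDecodedGVSummaryFlags in the reader earlier in this patch, the summary flag word carries the linkage in its low four bits, with NotEligibleToImport and LiveRoot in the two bits above them. The round trip below is only a sketch of that layout: the writer's packing of the linkage bits is outside the hunk shown, so the encode side is an assumption, and the reader additionally forces both flags on for summary versions older than 3.

  #include <cstdint>

  // Sketch of the flag layout implied by getDecodedGVSummaryFlags:
  //   bits 0-3: linkage, bit 4: NotEligibleToImport, bit 5: LiveRoot.
  static uint64_t encodeSummaryFlags(unsigned Linkage, bool NotEligibleToImport,
                                     bool LiveRoot) {
    uint64_t Flags = 0;
    Flags |= NotEligibleToImport;           // becomes bit 4 after the shift
    Flags |= uint64_t(LiveRoot) << 1;       // becomes bit 5 after the shift
    return (Flags << 4) | (Linkage & 0xF);  // assumed packing of the linkage
  }

  static void decodeSummaryFlags(uint64_t Raw, unsigned &Linkage,
                                 bool &NotEligibleToImport, bool &LiveRoot) {
    Linkage = Raw & 0xF;  // low 4 bits, as in getDecodedGVSummaryFlags
    Raw >>= 4;
    NotEligibleToImport = Raw & 0x1;
    LiveRoot = (Raw & 0x2) != 0;
  }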
@@ -1059,13 +1058,13 @@ void BitcodeWriterBase::writeValueSymbolTableForwardDecl() { // which is written after the function blocks so that it can include // the offset of each function. The placeholder offset will be // updated when the real VST is written. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_VSTOFFSET)); // Blocks are 32-bit aligned, so we can use a 32-bit word offset to // hold the real VST offset. Must use fixed instead of VBR as we don't // know how many VBR chunks to reserve ahead of time. Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned VSTOffsetAbbrev = Stream.EmitAbbrev(Abbv); + unsigned VSTOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit the placeholder uint64_t Vals[] = {bitc::MODULE_CODE_VSTOFFSET, 0}; @@ -1155,7 +1154,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { unsigned SimpleGVarAbbrev = 0; if (!M.global_empty()) { // Add an abbrev for common globals with no visibility or thread localness. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_GLOBALVAR)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(MaxGlobalType+1))); @@ -1177,7 +1176,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(SectionMap.size()+1))); // Don't bother emitting vis + thread local. - SimpleGVarAbbrev = Stream.EmitAbbrev(Abbv); + SimpleGVarAbbrev = Stream.EmitAbbrev(std::move(Abbv)); } // Emit the global variable information. @@ -1285,11 +1284,11 @@ void ModuleBitcodeWriter::writeModuleInfo() { AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7); // MODULE_CODE_SOURCE_FILENAME: [namechar x N] - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_SOURCE_FILENAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(AbbrevOpToUse); - unsigned FilenameAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FilenameAbbrev = Stream.EmitAbbrev(std::move(Abbv)); for (const auto P : M.getSourceFileName()) Vals.push_back((unsigned char)P); @@ -1360,14 +1359,14 @@ void ModuleBitcodeWriter::writeMDTuple(const MDTuple *N, unsigned ModuleBitcodeWriter::createDILocationAbbrev() { // Assume the column is usually under 128, and always output the inlined-at // location (it's never more expensive than building an array size 1). - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_LOCATION)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeDILocation(const DILocation *N, @@ -1389,7 +1388,7 @@ void ModuleBitcodeWriter::writeDILocation(const DILocation *N, unsigned ModuleBitcodeWriter::createGenericDINodeAbbrev() { // Assume the column is usually under 128, and always output the inlined-at // location (it's never more expensive than building an array size 1). 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_GENERIC_DEBUG)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); @@ -1397,7 +1396,7 @@ unsigned ModuleBitcodeWriter::createGenericDINodeAbbrev() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeGenericDINode(const GenericDINode *N, @@ -1790,11 +1789,11 @@ void ModuleBitcodeWriter::writeDIImportedEntity( } unsigned ModuleBitcodeWriter::createNamedMetadataAbbrev() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_NAME)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } void ModuleBitcodeWriter::writeNamedMetadata( @@ -1819,12 +1818,12 @@ void ModuleBitcodeWriter::writeNamedMetadata( } unsigned ModuleBitcodeWriter::createMetadataStringsAbbrev() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRINGS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // # of strings Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // offset to chars Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); - return Stream.EmitAbbrev(Abbv); + return Stream.EmitAbbrev(std::move(Abbv)); } /// Write out a record for MDString. @@ -1918,17 +1917,17 @@ void ModuleBitcodeWriter::writeModuleMetadata() { MDAbbrevs[MetadataAbbrev::GenericDINodeAbbrevID] = createGenericDINodeAbbrev(); - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_INDEX_OFFSET)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned OffsetAbbrev = Stream.EmitAbbrev(Abbv); + unsigned OffsetAbbrev = Stream.EmitAbbrev(std::move(Abbv)); - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_INDEX)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - unsigned IndexAbbrev = Stream.EmitAbbrev(Abbv); + unsigned IndexAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Emit MDStrings together upfront. writeMetadataStrings(VE.getMDStrings(), Record); @@ -2125,30 +2124,30 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, // If this is a constant pool for the module, emit module-specific abbrevs. if (isGlobal) { // Abbrev for CST_CODE_AGGREGATE. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_AGGREGATE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal+1))); - AggregateAbbrev = Stream.EmitAbbrev(Abbv); + AggregateAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_STRING. 
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_STRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - String8Abbrev = Stream.EmitAbbrev(Abbv); + String8Abbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_CSTRING. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - CString7Abbrev = Stream.EmitAbbrev(Abbv); + CString7Abbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for CST_CODE_CSTRING. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - CString6Abbrev = Stream.EmitAbbrev(Abbv); + CString6Abbrev = Stream.EmitAbbrev(std::move(Abbv)); } SmallVector Record; @@ -2858,39 +2857,39 @@ void ModuleBitcodeWriter::writeValueSymbolTable( unsigned GUIDEntryAbbrev; if (IsModuleLevel && hasVSTOffsetPlaceholder()) { // 8-bit fixed-width VST_CODE_FNENTRY function strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - FnEntry8BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry8BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // 7-bit fixed width VST_CODE_FNENTRY function strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - FnEntry7BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry7BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // 6-bit char6 VST_CODE_FNENTRY function strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv); + FnEntry6BitAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // FIXME: Change the name of this record as it is now used by // the per-module index as well. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid - GUIDEntryAbbrev = Stream.EmitAbbrev(Abbv); + GUIDEntryAbbrev = Stream.EmitAbbrev(std::move(Abbv)); } // FIXME: Set up the abbrev, we know how many values there are! 
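// The hunks in this file all apply the same mechanical change, so the new
// abbreviation-definition pattern is worth spelling out once. This is a
// hypothetical standalone fragment, not a line from the patch: the element type
// of make_shared is inferred to be BitCodeAbbrev from the new-expressions being
// replaced (template arguments are not visible in this copy of the diff), and it
// assumes EmitAbbrev now takes ownership of a std::shared_ptr<BitCodeAbbrev>
// rather than a raw pointer, which is why every call site hands the abbreviation
// off with std::move.
//
//   auto Abbv = std::make_shared<BitCodeAbbrev>();
//   Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
//   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // value id
//   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
//   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // 8-bit chars
//   unsigned EntryAbbrev = Stream.EmitAbbrev(std::move(Abbv));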
@@ -2984,11 +2983,11 @@ void IndexBitcodeWriter::writeCombinedValueSymbolTable() { Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4); - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid - unsigned EntryAbbrev = Stream.EmitAbbrev(Abbv); + unsigned EntryAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector NameVals; for (const auto &GVI : valueIds()) { @@ -3121,7 +3120,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Stream.EnterBlockInfoBlock(); { // 8-bit fixed-width VST_CODE_ENTRY/VST_CODE_BBENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3132,7 +3131,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // 7-bit fixed width VST_CODE_ENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3142,7 +3141,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // 6-bit char6 VST_CODE_ENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3152,7 +3151,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // 6-bit char6 VST_CODE_BBENTRY strings. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_BBENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -3165,7 +3164,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { { // SETTYPE abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, VE.computeBitsRequiredForTypeIndicies())); @@ -3175,7 +3174,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // INTEGER abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_INTEGER)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != @@ -3184,7 +3183,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // CE_CAST abbrev for CONSTANTS_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid @@ -3196,7 +3195,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // NULL abbrev for CONSTANTS_BLOCK. 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_NULL)); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) != CONSTANTS_NULL_Abbrev) @@ -3206,7 +3205,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { // FIXME: This should only use space for first class types! { // INST_LOAD abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3218,7 +3217,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_BINOP abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS @@ -3228,7 +3227,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_BINOP_FLAGS abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS @@ -3239,7 +3238,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_CAST abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3251,14 +3250,14 @@ void ModuleBitcodeWriter::writeBlockInfo() { } { // INST_RET abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET)); if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_RET_VOID_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_RET abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ValID if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != @@ -3266,14 +3265,14 @@ void ModuleBitcodeWriter::writeBlockInfo() { llvm_unreachable("Unexpected abbrev ordering!"); } { // INST_UNREACHABLE abbrev for FUNCTION_BLOCK. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNREACHABLE)); if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_UNREACHABLE_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); } { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_GEP)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty @@ -3296,38 +3295,38 @@ void IndexBitcodeWriter::writeModStrings() { // TODO: See which abbrev sizes we actually need to emit // 8-bit fixed-width MST_ENTRY strings. 
- BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); - unsigned Abbrev8Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev8Bit = Stream.EmitAbbrev(std::move(Abbv)); // 7-bit fixed width MST_ENTRY strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); - unsigned Abbrev7Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev7Bit = Stream.EmitAbbrev(std::move(Abbv)); // 6-bit char6 MST_ENTRY strings. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_ENTRY)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - unsigned Abbrev6Bit = Stream.EmitAbbrev(Abbv); + unsigned Abbrev6Bit = Stream.EmitAbbrev(std::move(Abbv)); // Module Hash, 160 bits SHA1. Optionally, emitted after each MST_CODE_ENTRY. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::MST_CODE_HASH)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); - unsigned AbbrevHash = Stream.EmitAbbrev(Abbv); + unsigned AbbrevHash = Stream.EmitAbbrev(std::move(Abbv)); SmallVector Vals; for (const auto &MPSE : Index.modulePaths()) { @@ -3435,7 +3434,7 @@ void ModuleBitcodeWriter::writeModuleLevelReferences( // Current version for the summary. // This is bumped whenever we introduce changes in the way some record are // interpreted, like flags for instance. -static const uint64_t INDEX_VERSION = 2; +static const uint64_t INDEX_VERSION = 3; /// Emit the per-module summary section alongside the rest of /// the module's bitcode. @@ -3450,7 +3449,7 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { } // Abbrev for FS_PERMODULE. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags @@ -3459,10 +3458,10 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_PERMODULE_PROFILE. 
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags @@ -3471,24 +3470,24 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() { // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_PERMODULE_GLOBALVAR_INIT_REFS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_GLOBALVAR_INIT_REFS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSModRefsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSModRefsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_ALIAS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_ALIAS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - unsigned FSAliasAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector NameVals; // Iterate over the list of functions instead of the Index to @@ -3542,7 +3541,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Stream.EmitRecord(bitc::FS_VERSION, ArrayRef{INDEX_VERSION}); // Abbrev for FS_COMBINED. - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid @@ -3552,10 +3551,10 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // numrefs x valueid, n x (valueid) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_PROFILE. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_PROFILE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid @@ -3565,26 +3564,26 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // numrefs x valueid, n x (valueid, hotness) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSCallsProfileAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_GLOBALVAR_INIT_REFS. 
- Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_GLOBALVAR_INIT_REFS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); // valueids Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned FSModRefsAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSModRefsAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // Abbrev for FS_COMBINED_ALIAS. - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALIAS)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid - unsigned FSAliasAbbrev = Stream.EmitAbbrev(Abbv); + unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv)); // The aliases are emitted as a post-pass, and will point to the value // id of the aliasee. Save them in a vector for post-processing. @@ -3702,19 +3701,19 @@ void writeIdentificationBlock(BitstreamWriter &Stream) { Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5); // Write the "user readable" string identifying the bitcode producer - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_STRING)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); - auto StringAbbrev = Stream.EmitAbbrev(Abbv); + auto StringAbbrev = Stream.EmitAbbrev(std::move(Abbv)); writeStringRecord(Stream, bitc::IDENTIFICATION_CODE_STRING, "LLVM" LLVM_VERSION_STRING, StringAbbrev); // Write the epoch version - Abbv = new BitCodeAbbrev(); + Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_EPOCH)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - auto EpochAbbrev = Stream.EmitAbbrev(Abbv); + auto EpochAbbrev = Stream.EmitAbbrev(std::move(Abbv)); SmallVector Vals = {bitc::BITCODE_CURRENT_EPOCH}; Stream.EmitRecord(bitc::IDENTIFICATION_CODE_EPOCH, Vals, EpochAbbrev); Stream.ExitBlock(); diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index 0c79def8793..61149d9229b 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -53,7 +53,8 @@ void ARMException::beginFunction(const MachineFunction *MF) { if (MoveType == AsmPrinter::CFI_M_Debug) { if (!hasEmittedCFISections) { - Asm->OutStreamer->EmitCFISections(false, true); + if (Asm->needsOnlyDebugCFIMoves()) + Asm->OutStreamer->EmitCFISections(false, true); hasEmittedCFISections = true; } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 5f15ac1d503..9f6caa95a9e 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -108,7 +108,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL, AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr Streamer) : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()), OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)), - LastMI(nullptr), LastFn(0), Counter(~0U) { + isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) { DD = nullptr; MMI = nullptr; LI = nullptr; @@ -264,6 +264,28 @@ bool 
AsmPrinter::doInitialization(Module &M) { } } + switch (MAI->getExceptionHandlingType()) { + case ExceptionHandling::SjLj: + case ExceptionHandling::DwarfCFI: + case ExceptionHandling::ARM: + isCFIMoveForDebugging = true; + if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI) + break; + for (auto &F: M.getFunctionList()) { + // If the module contains any function with unwind data, + // .eh_frame has to be emitted. + // Ignore functions that won't get emitted. + if (!F.isDeclarationForLinker() && F.needsUnwindTableEntry()) { + isCFIMoveForDebugging = false; + break; + } + } + break; + default: + isCFIMoveForDebugging = false; + break; + } + EHStreamer *ES = nullptr; switch (MAI->getExceptionHandlingType()) { case ExceptionHandling::None: diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 20075e41977..57864e4e4d4 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -100,6 +100,8 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, } SourceMgr SrcMgr; + SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths); + SrcMgrDiagInfo DiagInfo; // If the current LLVMContext has an inline asm handler, set it in SourceMgr. diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index ef30e279aed..e08306b001f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -137,7 +137,7 @@ void DwarfCFIException::beginFragment(const MachineBasicBlock *MBB, return; if (!hasEmittedCFISections) { - if (Asm->needsCFIMoves() == AsmPrinter::CFI_M_Debug) + if (Asm->needsOnlyDebugCFIMoves()) Asm->OutStreamer->EmitCFISections(false, true); hasEmittedCFISections = true; } diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp index cf35afbc6e5..89a042ffc47 100644 --- a/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -125,8 +125,11 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) { MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) { MachineBasicBlock *&MBB = BBToMBB[&BB]; if (!MBB) { - MBB = MF->CreateMachineBasicBlock(); + MBB = MF->CreateMachineBasicBlock(&BB); MF->push_back(MBB); + + if (BB.hasAddressTaken()) + MBB->setHasAddressTaken(); } return *MBB; } @@ -195,6 +198,45 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { return true; } +bool IRTranslator::translateSwitch(const User &U, + MachineIRBuilder &MIRBuilder) { + // For now, just translate as a chain of conditional branches. + // FIXME: could we share most of the logic/code in + // SelectionDAGBuilder::visitSwitch between SelectionDAG and GlobalISel? + // At first sight, it seems most of the logic in there is independent of + // SelectionDAG-specifics and a lot of work went in to optimize switch + // lowering in there. 
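// To make the "chain of conditional branches" shape concrete before the
// MachineIRBuilder code below, here is a plain C++ analogue of the control flow
// it emits: one equality test per case, each falling through to the next test,
// with the default destination reached only after every case has been rejected.
// The function and parameter names are invented for illustration; this is not
// code from the pass.
static int lowerSwitchAsCompareChain(int Cond, const int *CaseVals,
                                     const int *CaseResults, unsigned NumCases,
                                     int DefaultResult) {
  for (unsigned I = 0; I != NumCases; ++I)   // G_ICMP eq + conditional branch
    if (Cond == CaseVals[I])
      return CaseResults[I];                 // branch to the case successor
  return DefaultResult;                      // unconditional branch to default
}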
+ + const SwitchInst &SwInst = cast(U); + const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition()); + + LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL); + for (auto &CaseIt : SwInst.cases()) { + const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue()); + const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1); + MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor()); + + MIRBuilder.buildBrCond(Tst, TrueBB); + CurBB.addSuccessor(&TrueBB); + + MachineBasicBlock *FalseBB = + MF->CreateMachineBasicBlock(SwInst.getParent()); + MF->push_back(FalseBB); + MIRBuilder.buildBr(*FalseBB); + CurBB.addSuccessor(FalseBB); + + MIRBuilder.setMBB(*FalseBB); + } + // handle default case + MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest()); + MIRBuilder.buildBr(DefaultBB); + MIRBuilder.getMBB().addSuccessor(&DefaultBB); + + return true; +} + bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast(U); diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index a6c93bc0f3d..7d405dd92ac 100644 --- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -55,11 +55,10 @@ const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks) : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { - DEBUG(for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { +#ifndef NDEBUG + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); - assert(!RegBanks[Idx]->isValid() && - "RegisterBank should be invalid before initialization"); - }); +#endif // NDEBUG } RegisterBankInfo::~RegisterBankInfo() { @@ -70,13 +69,15 @@ RegisterBankInfo::~RegisterBankInfo() { } bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { - DEBUG(for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { +#ifndef NDEBUG + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { const RegisterBank &RegBank = getRegBank(Idx); assert(Idx == RegBank.getID() && "ID does not match the index in the array"); dbgs() << "Verify " << RegBank << '\n'; assert(RegBank.verify(TRI) && "RegBank is invalid"); - }); + } +#endif // NDEBUG return true; } diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 0cac7b71e24..b9f3d86eabd 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -1495,16 +1495,18 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { if (TII->reverseBranchCondition(Cond)) llvm_unreachable("Unable to reverse branch condition!"); - // Initialize liveins to the first BB. These are potentiall redefined by - // predicated instructions. Redefs.init(*TRI); - Redefs.addLiveIns(CvtMBB); - Redefs.addLiveIns(NextMBB); - - // Compute a set of registers which must not be killed by instructions in - // BB1: This is everything live-in to BB2. DontKill.init(*TRI); - DontKill.addLiveIns(NextMBB); + + if (MRI->tracksLiveness()) { + // Initialize liveins to the first BB. These are potentiall redefined by + // predicated instructions. 
+ Redefs.addLiveIns(CvtMBB); + Redefs.addLiveIns(NextMBB); + // Compute a set of registers which must not be killed by instructions in + // BB1: This is everything live-in to BB2. + DontKill.addLiveIns(NextMBB); + } if (CvtMBB.pred_size() > 1) { BBI.NonPredSize -= TII->removeBranch(*BBI.BB); @@ -1602,8 +1604,10 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { // Initialize liveins to the first BB. These are potentially redefined by // predicated instructions. Redefs.init(*TRI); - Redefs.addLiveIns(CvtMBB); - Redefs.addLiveIns(NextMBB); + if (MRI->tracksLiveness()) { + Redefs.addLiveIns(CvtMBB); + Redefs.addLiveIns(NextMBB); + } DontKill.clear(); @@ -1766,8 +1770,10 @@ bool IfConverter::IfConvertDiamondCommon( // instructions. We start with BB1 live-ins so we have the live-out regs // after tracking the BB1 instructions. Redefs.init(*TRI); - Redefs.addLiveIns(MBB1); - Redefs.addLiveIns(MBB2); + if (MRI->tracksLiveness()) { + Redefs.addLiveIns(MBB1); + Redefs.addLiveIns(MBB2); + } // Remove the duplicated instructions at the beginnings of both paths. // Skip dbg_value instructions @@ -1792,12 +1798,14 @@ bool IfConverter::IfConvertDiamondCommon( // This is everything used+live in BB2 after the duplicated instructions. We // can compute this set by simulating liveness backwards from the end of BB2. DontKill.init(*TRI); - for (const MachineInstr &MI : make_range(MBB2.rbegin(), ++DI2.getReverse())) - DontKill.stepBackward(MI); + if (MRI->tracksLiveness()) { + for (const MachineInstr &MI : make_range(MBB2.rbegin(), ++DI2.getReverse())) + DontKill.stepBackward(MI); - for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) { - SmallVector, 4> IgnoredClobbers; - Redefs.stepForward(MI, IgnoredClobbers); + for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) { + SmallVector, 4> Dummy; + Redefs.stepForward(MI, Dummy); + } } BBI.BB->splice(BBI.BB->end(), &MBB1, MBB1.begin(), DI1); MBB2.erase(MBB2.begin(), DI2); diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index eb13d2d3ec0..db87092177c 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -488,16 +488,16 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { } // Print the live in registers. 
- const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); - assert(TRI && "Expected target register info"); - if (!MBB.livein_empty()) { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + if (MRI.tracksLiveness() && !MBB.livein_empty()) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); OS.indent(2) << "liveins: "; bool First = true; for (const auto &LI : MBB.liveins()) { if (!First) OS << ", "; First = false; - printReg(LI.PhysReg, OS, TRI); + printReg(LI.PhysReg, OS, &TRI); if (!LI.LaneMask.all()) OS << ":0x" << PrintLaneMask(LI.LaneMask); } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 549424d257f..3869f976854 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -286,7 +286,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, if (!livein_empty()) { if (Indexes) OS << '\t'; OS << " Live Ins:"; - for (const auto &LI : make_range(livein_begin(), livein_end())) { + for (const auto &LI : LiveIns) { OS << ' ' << PrintReg(LI.PhysReg, TRI); if (!LI.LaneMask.all()) OS << ':' << PrintLaneMask(LI.LaneMask); @@ -1292,3 +1292,10 @@ MachineBasicBlock::getEndClobberMask(const TargetRegisterInfo *TRI) const { void MachineBasicBlock::clearLiveIns() { LiveIns.clear(); } + +MachineBasicBlock::livein_iterator MachineBasicBlock::livein_begin() const { + assert(getParent()->getProperties().hasProperty( + MachineFunctionProperties::Property::TracksLiveness) && + "Liveness information is accurate"); + return LiveIns.begin(); +} diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 426a4666c64..a98139f9e5a 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -566,7 +566,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { FirstTerminator = nullptr; if (!MF->getProperties().hasProperty( - MachineFunctionProperties::Property::NoPHIs)) { + MachineFunctionProperties::Property::NoPHIs) && MRI->tracksLiveness()) { // If this block has allocatable physical registers live-in, check that // it is an entry block or landing pad. for (const auto &LI : MBB->liveins()) { @@ -741,14 +741,16 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } regsLive.clear(); - for (const auto &LI : MBB->liveins()) { - if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { - report("MBB live-in list contains non-physical register", MBB); - continue; + if (MRI->tracksLiveness()) { + for (const auto &LI : MBB->liveins()) { + if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) { + report("MBB live-in list contains non-physical register", MBB); + continue; + } + for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + regsLive.insert(*SubRegs); } - for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - regsLive.insert(*SubRegs); } regsLiveInButUnused = regsLive; diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index de1c35caa1a..fdf741fd58f 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -48,11 +48,6 @@ void RegScavenger::init(MachineBasicBlock &MBB) { assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) && "Target changed?"); - // It is not possible to use the register scavenger after late optimization - // passes that don't preserve accurate liveness information. 
- assert(MRI->tracksLiveness() && - "Cannot use register scavenger with inaccurate liveness"); - // Self-initialize. if (!this->MBB) { NumRegUnits = TRI->getNumRegUnits(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cb803585282..a07bd8f8354 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -7339,19 +7340,23 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG, if (!Range) return Op; - Constant *Lo = cast(Range->getOperand(0))->getValue(); - if (!Lo->isNullValue()) + ConstantRange CR = getConstantRangeFromMetadata(*Range); + if (CR.isFullSet() || CR.isEmptySet() || CR.isWrappedSet()) return Op; - Constant *Hi = cast(Range->getOperand(1))->getValue(); - unsigned Bits = cast(Hi)->getValue().logBase2(); + APInt Lo = CR.getUnsignedMin(); + if (!Lo.isMinValue()) + return Op; + + APInt Hi = CR.getUnsignedMax(); + unsigned Bits = Hi.getActiveBits(); EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits); SDLoc SL = getCurSDLoc(); - SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), - Op, DAG.getValueType(SmallVT)); + SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), Op, + DAG.getValueType(SmallVT)); unsigned NumVals = Op.getNode()->getNumValues(); if (NumVals == 1) return ZExt; diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index deec1633022..2aac3474654 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -299,11 +299,8 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const { Ranges.insert(Ranges.end(), DIERanges.begin(), DIERanges.end()); } - DWARFDie Child = getFirstChild(); - while (Child) { + for (auto Child: children()) Child.collectChildrenAddressRanges(Ranges); - Child = Child.getSibling(); - } } bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const { diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index e6c9764f113..2bbcb25275e 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -468,6 +468,7 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { Options.HandleInt = Flags.handle_int; Options.HandleSegv = Flags.handle_segv; Options.HandleTerm = Flags.handle_term; + Options.HandleXfsz = Flags.handle_xfsz; SetSignalHandler(Options); if (Flags.minimize_crash_internal_step) diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 08eaad9856b..22aad353ace 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -91,6 +91,7 @@ FUZZER_FLAG_INT(handle_ill, 1, "If 1, try to intercept SIGILL.") FUZZER_FLAG_INT(handle_fpe, 1, "If 1, try to intercept SIGFPE.") FUZZER_FLAG_INT(handle_int, 1, "If 1, try to intercept SIGINT.") FUZZER_FLAG_INT(handle_term, 1, "If 1, try to intercept SIGTERM.") +FUZZER_FLAG_INT(handle_xfsz, 1, "If 1, try to intercept SIGXFSZ.") FUZZER_FLAG_INT(close_fd_mask, 0, "If 1, close stdout at startup; " "if 2, close stderr; if 3, close both. " "Be careful, this will also close e.g. 
asan's stderr/stdout.") diff --git a/lib/Fuzzer/FuzzerIO.h b/lib/Fuzzer/FuzzerIO.h index 741fecf415b..15bfd3d3472 100644 --- a/lib/Fuzzer/FuzzerIO.h +++ b/lib/Fuzzer/FuzzerIO.h @@ -37,6 +37,9 @@ std::string DirPlusFile(const std::string &DirPath, // Returns the name of the dir, similar to the 'dirname' utility. std::string DirName(const std::string &FileName); +// Returns path to a TmpDir. +std::string TmpDir(); + void DupAndCloseStderr(); void CloseStdout(); diff --git a/lib/Fuzzer/FuzzerIOPosix.cpp b/lib/Fuzzer/FuzzerIOPosix.cpp index 720bc130459..6d8edf6ff53 100644 --- a/lib/Fuzzer/FuzzerIOPosix.cpp +++ b/lib/Fuzzer/FuzzerIOPosix.cpp @@ -83,6 +83,12 @@ std::string DirName(const std::string &FileName) { return Res; } +std::string TmpDir() { + if (auto Env = getenv("TMPDIR")) + return Env; + return "/tmp"; +} + } // namespace fuzzer #endif // LIBFUZZER_POSIX diff --git a/lib/Fuzzer/FuzzerIOWindows.cpp b/lib/Fuzzer/FuzzerIOWindows.cpp index a4738eb9dfe..056f0721a33 100644 --- a/lib/Fuzzer/FuzzerIOWindows.cpp +++ b/lib/Fuzzer/FuzzerIOWindows.cpp @@ -277,6 +277,8 @@ std::string DirName(const std::string &FileName) { return FileName.substr(0, LocationLen + DirLen); } +std::string TmpDir() { return "TODO: implement TmpDir"; } + } // namespace fuzzer #endif // LIBFUZZER_WINDOWS diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index c041706092d..0d2c7a78aca 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -82,6 +82,7 @@ class Fuzzer { static void StaticAlarmCallback(); static void StaticCrashSignalCallback(); static void StaticInterruptCallback(); + static void StaticFileSizeExceedCallback(); void ExecuteCallback(const uint8_t *Data, size_t Size); size_t RunOne(const uint8_t *Data, size_t Size); diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 1336f5e4aee..9f49d155799 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -266,6 +266,11 @@ void Fuzzer::StaticInterruptCallback() { F->InterruptCallback(); } +void Fuzzer::StaticFileSizeExceedCallback() { + Printf("==%lu== ERROR: libFuzzer: file size exceeded\n", GetPid()); + exit(1); +} + void Fuzzer::CrashCallback() { Printf("==%lu== ERROR: libFuzzer: deadly signal\n", GetPid()); if (EF->__sanitizer_print_stack_trace) diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp index 84660e0fe53..9e559115680 100644 --- a/lib/Fuzzer/FuzzerMerge.cpp +++ b/lib/Fuzzer/FuzzerMerge.cpp @@ -220,8 +220,8 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, ListFilesInDirRecursive(Corpora[i], nullptr, &AllFiles, /*TopDir*/true); Printf("MERGE-OUTER: %zd files, %zd in the initial corpus\n", AllFiles.size(), NumFilesInFirstCorpus); - std::string CFPath = - "libFuzzerTemp." + std::to_string(GetPid()) + ".txt"; + auto CFPath = DirPlusFile(TmpDir(), + "libFuzzerTemp." + std::to_string(GetPid()) + ".txt"); // Write the control file. RemoveFile(CFPath); std::ofstream ControlFile(CFPath); @@ -229,6 +229,11 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, ControlFile << NumFilesInFirstCorpus << "\n"; for (auto &Path: AllFiles) ControlFile << Path << "\n"; + if (!ControlFile) { + Printf("MERGE-OUTER: failed to write to the control file: %s\n", + CFPath.c_str()); + exit(1); + } ControlFile.close(); // Execute the inner process untill it passes. @@ -246,6 +251,9 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, // Read the control file and do the merge. 
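// For reference when reading the ParseOrExit call below: the control file
// written above is line oriented, starting with the total number of inputs,
// then the number that came from the initial corpus, then one path per line.
// A hypothetical file for three inputs, two of them from the initial corpus,
// would look like this (paths invented for illustration):
//
//   3
//   2
//   corpus1/a
//   corpus1/b
//   corpus2/c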
Merger M; std::ifstream IF(CFPath); + IF.seekg(0, IF.end); + Printf("MERGE-OUTER: the control file has %zd bytes\n", (size_t)IF.tellg()); + IF.seekg(0, IF.beg); M.ParseOrExit(IF, true); IF.close(); std::vector NewFiles; diff --git a/lib/Fuzzer/FuzzerOptions.h b/lib/Fuzzer/FuzzerOptions.h index cb702d28520..6f72205600b 100644 --- a/lib/Fuzzer/FuzzerOptions.h +++ b/lib/Fuzzer/FuzzerOptions.h @@ -62,6 +62,7 @@ struct FuzzingOptions { bool HandleInt = false; bool HandleSegv = false; bool HandleTerm = false; + bool HandleXfsz = false; }; } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp index be62a6624b2..2ad9702fab0 100644 --- a/lib/Fuzzer/FuzzerTraceState.cpp +++ b/lib/Fuzzer/FuzzerTraceState.cpp @@ -46,10 +46,6 @@ class TraceState { void TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1, const uint8_t *Data2); - void TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits, uint64_t Val, - size_t NumCases, uint64_t *Cases); - int TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData, - size_t DataSize); int TryToAddDesiredData(const uint8_t *PresentData, const uint8_t *DesiredData, size_t DataSize); @@ -147,29 +143,6 @@ class TraceState { size_t AutoDictAdds = 0; }; -int TraceState::TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData, - size_t DataSize) { - if (NumMutations >= kMaxMutations || !WantToHandleOneMoreMutation()) return 0; - ScopedDoingMyOwnMemmem scoped_doing_my_own_memmem; - const uint8_t *UnitData; - auto UnitSize = F->GetCurrentUnitInFuzzingThead(&UnitData); - int Res = 0; - const uint8_t *Beg = UnitData; - const uint8_t *End = Beg + UnitSize; - for (const uint8_t *Cur = Beg; Cur < End; Cur++) { - Cur = (uint8_t *)SearchMemory(Cur, End - Cur, &PresentData, DataSize); - if (!Cur) - break; - size_t Pos = Cur - Beg; - assert(Pos < UnitSize); - AddMutation(Pos, DataSize, DesiredData); - AddMutation(Pos, DataSize, DesiredData + 1); - AddMutation(Pos, DataSize, DesiredData - 1); - Res++; - } - return Res; -} - int TraceState::TryToAddDesiredData(const uint8_t *PresentData, const uint8_t *DesiredData, size_t DataSize) { @@ -206,26 +179,6 @@ void TraceState::TraceMemcmpCallback(size_t CmpSize, const uint8_t *Data1, } } -void TraceState::TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits, - uint64_t Val, size_t NumCases, - uint64_t *Cases) { - if (F->InFuzzingThread()) return; - size_t ValSize = ValSizeInBits / 8; - bool TryShort = IsTwoByteData(Val); - for (size_t i = 0; i < NumCases; i++) - TryShort &= IsTwoByteData(Cases[i]); - - if (Options.Verbosity >= 3) - Printf("TraceSwitch: %p %zd # %zd; TryShort %d\n", PC, Val, NumCases, - TryShort); - - for (size_t i = 0; i < NumCases; i++) { - TryToAddDesiredData(Val, Cases[i], ValSize); - if (TryShort) - TryToAddDesiredData(Val, Cases[i], 2); - } -} - static TraceState *TS; void Fuzzer::StartTraceRecording() { diff --git a/lib/Fuzzer/FuzzerUtilPosix.cpp b/lib/Fuzzer/FuzzerUtilPosix.cpp index 8b484b8effa..e8d48dc81a3 100644 --- a/lib/Fuzzer/FuzzerUtilPosix.cpp +++ b/lib/Fuzzer/FuzzerUtilPosix.cpp @@ -41,6 +41,10 @@ static void InterruptHandler(int, siginfo_t *, void *) { Fuzzer::StaticInterruptCallback(); } +static void FileSizeExceedHandler(int, siginfo_t *, void *) { + Fuzzer::StaticFileSizeExceedCallback(); +} + static void SetSigaction(int signum, void (*callback)(int, siginfo_t *, void *)) { struct sigaction sigact; @@ -80,6 +84,8 @@ void SetSignalHandler(const FuzzingOptions& Options) { SetSigaction(SIGILL, CrashHandler); if (Options.HandleFpe) 
SetSigaction(SIGFPE, CrashHandler); + if (Options.HandleXfsz) + SetSigaction(SIGXFSZ, FileSizeExceedHandler); } void SleepSeconds(int Seconds) { diff --git a/lib/Fuzzer/FuzzerUtilWindows.cpp b/lib/Fuzzer/FuzzerUtilWindows.cpp index 64adb7cd138..3ca1f2c8f56 100644 --- a/lib/Fuzzer/FuzzerUtilWindows.cpp +++ b/lib/Fuzzer/FuzzerUtilWindows.cpp @@ -58,6 +58,7 @@ LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) { if (HandlerOpt->HandleFpe) Fuzzer::StaticCrashSignalCallback(); break; + // TODO: handle (Options.HandleXfsz) } return EXCEPTION_CONTINUE_SEARCH; } diff --git a/lib/Fuzzer/test/merge.test b/lib/Fuzzer/test/merge.test index 1f1810eb019..5c7d30e41ca 100644 --- a/lib/Fuzzer/test/merge.test +++ b/lib/Fuzzer/test/merge.test @@ -44,3 +44,11 @@ MERGE_WITH_CRASH: MERGE-OUTER: 3 new files # Check that we actually limit the size with max_len RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 -max_len=5 2>&1 | FileCheck %s --check-prefix=MERGE_LEN5 MERGE_LEN5: MERGE-OUTER: succesfull in 1 attempt(s) + +# Check that we honor TMPDIR +RUN: TMPDIR=DIR_DOES_NOT_EXIST not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=TMPDIR +TMPDIR: MERGE-OUTER: failed to write to the control file: DIR_DOES_NOT_EXIST/libFuzzerTemp + +# Check that we can report an error if file size exceeded +RUN: (ulimit -f 1; not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ) +SIGXFSZ: ERROR: libFuzzer: file size exceeded diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 42b3a344352..e3e2f9f806c 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -337,12 +337,21 @@ void LTO::addSymbolToGlobalRes(SmallPtrSet &Used, if (Res.Prevailing) GlobalRes.IRName = GV->getName(); } + // Set the partition to external if we know it is used elsewhere, e.g. + // it is visible to a regular object, is referenced from llvm.compiler_used, + // or was already recorded as being referenced from a different partition. if (Res.VisibleToRegularObj || (GV && Used.count(GV)) || (GlobalRes.Partition != GlobalResolution::Unknown && - GlobalRes.Partition != Partition)) + GlobalRes.Partition != Partition)) { GlobalRes.Partition = GlobalResolution::External; - else + } else + // First recorded reference, save the current partition. GlobalRes.Partition = Partition; + + // Flag as visible outside of ThinLTO if visible from a regular object or + // if this is a reference in the regular LTO partition. + GlobalRes.VisibleOutsideThinLTO |= + (Res.VisibleToRegularObj || (Partition == GlobalResolution::RegularLTO)); } static void writeToResolutionFile(raw_ostream &OS, InputFile *Input, @@ -848,6 +857,19 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, if (!ModuleToDefinedGVSummaries.count(Mod.first)) ModuleToDefinedGVSummaries.try_emplace(Mod.first); + // Compute "dead" symbols, we don't want to import/export these! + DenseSet GUIDPreservedSymbols; + for (auto &Res : GlobalResolutions) { + if (Res.second.VisibleOutsideThinLTO && + // IRName will be defined if we have seen the prevailing copy of + // this value. If not, no need to preserve any ThinLTO copies. 
+ !Res.second.IRName.empty()) + GUIDPreservedSymbols.insert(GlobalValue::getGUID(Res.second.IRName)); + } + + auto DeadSymbols = + computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols); + StringMap ImportLists( ThinLTO.ModuleMap.size()); StringMap ExportLists( @@ -856,12 +878,21 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, if (Conf.OptLevel > 0) { ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, - ImportLists, ExportLists); + ImportLists, ExportLists, &DeadSymbols); std::set ExportedGUIDs; for (auto &Res : GlobalResolutions) { - if (!Res.second.IRName.empty() && - Res.second.Partition == GlobalResolution::External) + // First check if the symbol was flagged as having external references. + if (Res.second.Partition != GlobalResolution::External) + continue; + // IRName will be defined if we have seen the prevailing copy of + // this value. If not, no need to mark as exported from a ThinLTO + // partition (and we can't get the GUID). + if (Res.second.IRName.empty()) + continue; + auto GUID = GlobalValue::getGUID(Res.second.IRName); + // Mark exported unless index-based analysis determined it to be dead. + if (!DeadSymbols.count(GUID)) ExportedGUIDs.insert(GlobalValue::getGUID(Res.second.IRName)); } diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 880dc3dfae9..66ffe6db29d 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -581,11 +581,18 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, StringMap ModuleToDefinedGVSummaries; Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID + auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( + PreservedSymbols, Triple(TheModule.getTargetTriple())); + + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); // Resolve LinkOnce/Weak symbols. StringMap> ResolvedODR; @@ -594,10 +601,6 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, thinLTOResolveWeakForLinkerModule( TheModule, ModuleToDefinedGVSummaries[ModuleIdentifier]); - // Convert the preserved symbols set from string to GUID - auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( - PreservedSymbols, Triple(TheModule.getTargetTriple())); - // Promote the exported values in the index, so that they are promoted // in the module. auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) { @@ -623,11 +626,18 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule, StringMap ModuleToDefinedGVSummaries(ModuleCount); Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID + auto GUIDPreservedSymbols = computeGUIDPreservedSymbols( + PreservedSymbols, Triple(TheModule.getTargetTriple())); + + // Compute "dead" symbols, we don't want to import/export these! 
+ auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); auto &ImportList = ImportLists[TheModule.getModuleIdentifier()]; crossImportIntoModule(TheModule, Index, ModuleMap, ImportList); @@ -697,11 +707,14 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule, StringMap ModuleToDefinedGVSummaries(ModuleCount); Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols); + // Generate import/export list StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); + ExportLists, &DeadSymbols); auto &ExportList = ExportLists[ModuleIdentifier]; // Be friendly and don't nuke totally the module when the client didn't @@ -836,17 +849,20 @@ void ThinLTOCodeGenerator::run() { StringMap ModuleToDefinedGVSummaries(ModuleCount); Index->collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); + // Convert the preserved symbols set from string to GUID, this is needed for + // computing the caching hash and the internalization. + auto GUIDPreservedSymbols = + computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); + + // Compute "dead" symbols, we don't want to import/export these! + auto DeadSymbols = computeDeadSymbols(*Index, GUIDPreservedSymbols); + // Collect the import/export lists for all modules from the call-graph in the // combined index. StringMap ImportLists(ModuleCount); StringMap ExportLists(ModuleCount); ComputeCrossModuleImport(*Index, ModuleToDefinedGVSummaries, ImportLists, - ExportLists); - - // Convert the preserved symbols set from string to GUID, this is needed for - // computing the caching hash and the internalization. - auto GUIDPreservedSymbols = - computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple); + ExportLists, &DeadSymbols); // We use a std::map here to be able to have a defined ordering when // producing a hash for the cache entry. diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 0c0b498f137..fb8b45166a4 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -205,7 +205,7 @@ APInt& APInt::operator++() { /// This function subtracts a single "digit" (64-bit word), y, from /// the multi-digit integer array, x[], propagating the borrowed 1 value until -/// no further borrowing is neeeded or it runs out of "digits" in x. The result +/// no further borrowing is needed or it runs out of "digits" in x. The result /// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. /// In other words, if y > x then this function returns 1, otherwise 0. 
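// A free-standing illustration of the borrow propagation described above,
// operating on a small array of 64-bit "digits". This sketches the same idea in
// isolation; it is not the library's implementation, and the name and signature
// are invented:
static int subOneWordWithBorrow(uint64_t X[], unsigned NumWords, uint64_t Y) {
  for (unsigned I = 0; I != NumWords; ++I) {
    uint64_t Prev = X[I];
    X[I] -= Y;          // subtract the digit (or the borrowed 1)
    if (Prev >= Y)
      return 0;         // no underflow here, so borrowing stops
    Y = 1;              // underflow: borrow 1 from the next digit up
  }
  return 1;             // digits exhausted, i.e. y was larger than x
}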
/// @returns the borrow out of the subtraction diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index ca344b1dc05..15418ad2fd0 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -90,6 +90,7 @@ add_llvm_library(LLVMSupport StringSaver.cpp StringRef.cpp SystemUtils.cpp + TarWriter.cpp TargetParser.cpp ThreadPool.cpp Timer.cpp diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 49d0ed55a71..8a09589aa88 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -474,15 +474,25 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model, break; // Skylake: - case 0x4e: - *Type = INTEL_COREI7; // "skylake-avx512" - *Subtype = INTEL_COREI7_SKYLAKE_AVX512; - break; - case 0x5e: + case 0x4e: // Skylake mobile + case 0x5e: // Skylake desktop + case 0x8e: // Kaby Lake mobile + case 0x9e: // Kaby Lake desktop *Type = INTEL_COREI7; // "skylake" *Subtype = INTEL_COREI7_SKYLAKE; break; + // Skylake Xeon: + case 0x55: + *Type = INTEL_COREI7; + // Check that we really have AVX512 + if (Features & (1 << FEATURE_AVX512)) { + *Subtype = INTEL_COREI7_SKYLAKE_AVX512; // "skylake-avx512" + } else { + *Subtype = INTEL_COREI7_SKYLAKE; // "skylake" + } + break; + case 0x1c: // Most 45 nm Intel Atom processors case 0x26: // 45 nm Atom Lincroft case 0x27: // 32 nm Atom Medfield diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp new file mode 100644 index 00000000000..5fc17d27637 --- /dev/null +++ b/lib/Support/TarWriter.cpp @@ -0,0 +1,166 @@ +//===-- TarWriter.cpp - Tar archive file creator --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TarWriter class provides a feature to create a tar archive file. +// +// I put emphasis on simplicity over comprehensiveness when implementing this +// class because we don't need a full-fledged archive file generator in LLVM +// at the moment. +// +// The filename field in the Unix V7 tar header is 100 bytes. Longer filenames +// are stored using the PAX extension. The PAX header is standardized in +// POSIX.1-2001. +// +// The struct definition of UstarHeader is copied from +// https://www.freebsd.org/cgi/man.cgi?query=tar&sektion=5 +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/TarWriter.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +// Each file in an archive must be aligned to this block size. +static const int BlockSize = 512; + +struct UstarHeader { + char Name[100]; + char Mode[8]; + char Uid[8]; + char Gid[8]; + char Size[12]; + char Mtime[12]; + char Checksum[8]; + char TypeFlag; + char Linkname[100]; + char Magic[6]; + char Version[2]; + char Uname[32]; + char Gname[32]; + char DevMajor[8]; + char DevMinor[8]; + char Prefix[155]; + char Pad[12]; +}; +static_assert(sizeof(UstarHeader) == BlockSize, "invalid Ustar header"); + +// A PAX attribute is in the form of " =\n" +// where is the length of the entire string including +// the length field itself. An example string is this. +// +// 25 ctime=1084839148.1212\n +// +// This function create such string. 
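// Worked example of the self-referential length field that formatPax (below)
// computes, with values chosen purely for illustration. For Key = "path" and
// Val = "/foo":
//
//   Len   = 4 + 4 + 3           = 11   // key + value + " ", "=", "\n"
//   Total = Len + strlen("11")  = 13   // first guess, using Len's own width
//   Total = Len + strlen("13")  = 13   // recomputed; unchanged, so it is final
//
// The emitted record is "13 path=/foo\n", which is indeed 13 bytes long. The
// second pass matters when appending the length field pushes the total into one
// more decimal digit, which is the off-by-one case the comment above mentions.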
+static std::string formatPax(StringRef Key, StringRef Val) { + int Len = Key.size() + Val.size() + 3; // +3 for " ", "=" and "\n" + + // We need to compute total size twice because appending + // a length field could change total size by one. + int Total = Len + Twine(Len).str().size(); + Total = Len + Twine(Total).str().size(); + return (Twine(Total) + " " + Key + "=" + Val + "\n").str(); +} + +// Headers in tar files must be aligned to 512 byte boundaries. +// This function forwards the current file position to the next boundary. +static void pad(raw_fd_ostream &OS) { + uint64_t Pos = OS.tell(); + OS.seek(alignTo(Pos, BlockSize)); +} + +// Computes a checksum for a tar header. +static void computeChecksum(UstarHeader &Hdr) { + // Before computing a checksum, checksum field must be + // filled with space characters. + memset(Hdr.Checksum, ' ', sizeof(Hdr.Checksum)); + + // Compute a checksum and set it to the checksum field. + unsigned Chksum = 0; + for (size_t I = 0; I < sizeof(Hdr); ++I) + Chksum += reinterpret_cast(&Hdr)[I]; + snprintf(Hdr.Checksum, sizeof(Hdr.Checksum), "%06o", Chksum); +} + +// Create a tar header and write it to a given output stream. +static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { + // A PAX header consists of a 512-byte header followed + // by key-value strings. First, create key-value strings. + std::string PaxAttr = formatPax("path", Path); + + // Create a 512-byte header. + UstarHeader Hdr = {}; + snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", PaxAttr.size()); + Hdr.TypeFlag = 'x'; // PAX magic + memcpy(Hdr.Magic, "ustar", 6); // Ustar magic + computeChecksum(Hdr); + + // Write them down. + OS << StringRef(reinterpret_cast(&Hdr), sizeof(Hdr)); + OS << PaxAttr; + pad(OS); +} + +// The PAX header is an extended format, so a PAX header needs +// to be followed by a "real" header. +static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) { + UstarHeader Hdr = {}; + memcpy(Hdr.Name, Path.data(), Path.size()); + memcpy(Hdr.Mode, "0000664", 8); + snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size); + memcpy(Hdr.Magic, "ustar", 6); + computeChecksum(Hdr); + OS << StringRef(reinterpret_cast(&Hdr), sizeof(Hdr)); +} + +// We want to use '/' as a path separator even on Windows. +// This function canonicalizes a given path. +static std::string canonicalize(std::string S) { +#ifdef LLVM_ON_WIN32 + std::replace(S.begin(), S.end(), '\\', '/'); +#endif + return S; +} + +// Creates a TarWriter instance and returns it. +Expected> TarWriter::create(StringRef OutputPath, + StringRef BaseDir) { + int FD; + if (std::error_code EC = openFileForWrite(OutputPath, FD, sys::fs::F_None)) + return make_error("cannot open " + OutputPath, EC); + return std::unique_ptr(new TarWriter(FD, BaseDir)); +} + +TarWriter::TarWriter(int FD, StringRef BaseDir) + : OS(FD, /*shouldClose=*/true, /*unbuffered=*/false), BaseDir(BaseDir) {} + +// Append a given file to an archive. +void TarWriter::append(StringRef Path, StringRef Data) { + // Write Path and Data. + std::string S = BaseDir + "/" + canonicalize(Path) + "\0"; + if (S.size() <= sizeof(UstarHeader::Name)) { + writeUstarHeader(OS, S, Data.size()); + } else { + writePaxHeader(OS, S); + writeUstarHeader(OS, "", Data.size()); + } + + OS << Data; + pad(OS); + + // POSIX requires tar archives end with two null blocks. + // Here, we write the terminator and then seek back, so that + // the file being output is terminated correctly at any moment. 
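// For a sense of how the class defined in this file is meant to be driven end
// to end, a hypothetical caller might look like the fragment below. The file
// and directory names are invented, and Expected<std::unique_ptr<TarWriter>> is
// inferred as the return type of create() (template arguments are not visible
// in this copy of the diff).
//
//   Expected<std::unique_ptr<TarWriter>> Tar =
//       TarWriter::create("repro.tar", "repro-dir");
//   if (!Tar)
//     return Tar.takeError();              // could not open the output file
//   (*Tar)->append("response.txt", Data1); // short path: plain ustar header
//   (*Tar)->append(VeryLongPath, Data2);   // too long for the 100-byte name
//                                          // field: PAX header, then ustar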
+  uint64_t Pos = OS.tell();
+  OS << std::string(BlockSize * 2, '\0');
+  OS.seek(Pos);
+  OS.flush();
+}
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 3750d7f4c09..9752b70644c 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -48,7 +48,7 @@
 // _Unwind_Backtrace function, but on FreeBSD the configure test passes
 // despite the function not existing, and on Android, conflicts
 // with <unwind.h>.
-#if defined(__GLIBC__) || defined(__APPLE__)
+#ifdef __GLIBC__
 #include <execinfo.h>
 #else
 #undef HAVE__UNWIND_BACKTRACE
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 7666011f75b..17aafa0c3d6 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -110,72 +110,34 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-collect-loh"
 
-static cl::opt<bool>
-PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden,
-                   cl::desc("Restrict analysis to registers invovled"
-                            " in LOHs"),
-                   cl::init(true));
-
-static cl::opt<bool>
-BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden,
-                    cl::desc("Restrict analysis at basic block scope"),
-                    cl::init(true));
-
 STATISTIC(NumADRPSimpleCandidate,
           "Number of simplifiable ADRP dominate by another");
-#ifndef NDEBUG
-STATISTIC(NumADRPComplexCandidate2,
-          "Number of simplifiable ADRP reachable by 2 defs");
-STATISTIC(NumADRPComplexCandidate3,
-          "Number of simplifiable ADRP reachable by 3 defs");
-STATISTIC(NumADRPComplexCandidateOther,
-          "Number of simplifiable ADRP reachable by 4 or more defs");
-STATISTIC(NumADDToSTRWithImm,
-          "Number of simplifiable STR with imm reachable by ADD");
-STATISTIC(NumLDRToSTRWithImm,
-          "Number of simplifiable STR with imm reachable by LDR");
 STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD");
 STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR");
-STATISTIC(NumADDToLDRWithImm,
-          "Number of simplifiable LDR with imm reachable by ADD");
-STATISTIC(NumLDRToLDRWithImm,
-          "Number of simplifiable LDR with imm reachable by LDR");
 STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD");
 STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR");
-#endif // NDEBUG
 STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP");
-#ifndef NDEBUG
-STATISTIC(NumCplxLvl1, "Number of complex case of level 1");
-STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1");
-STATISTIC(NumCplxLvl2, "Number of complex case of level 2");
-STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2");
-#endif // NDEBUG
 STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
-STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
 
 #define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)"
 
 namespace {
+
 struct AArch64CollectLOH : public MachineFunctionPass {
   static char ID;
-  AArch64CollectLOH() : MachineFunctionPass(ID) {
-
initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry()); - } + AArch64CollectLOH() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -187,351 +149,57 @@ struct AArch64CollectLOH : public MachineFunctionPass { StringRef getPassName() const override { return AARCH64_COLLECT_LOH_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); + AU.setPreservesAll(); } - -private: }; -/// A set of MachineInstruction. -typedef SetVector SetOfMachineInstr; -/// Map a basic block to a set of instructions per register. -/// This is used to represent the exposed uses of a basic block -/// per register. -typedef MapVector> -BlockToSetOfInstrsPerColor; -/// Map a basic block to an instruction per register. -/// This is used to represent the live-out definitions of a basic block -/// per register. -typedef MapVector> -BlockToInstrPerColor; -/// Map an instruction to a set of instructions. Used to represent the -/// mapping def to reachable uses or use to definitions. -typedef MapVector InstrToInstrs; -/// Map a basic block to a BitVector. -/// This is used to record the kill registers per basic block. -typedef MapVector BlockToRegSet; - -/// Map a register to a dense id. -typedef DenseMap MapRegToId; -/// Map a dense id to a register. Used for debug purposes. -typedef SmallVector MapIdToReg; -} // end anonymous namespace. - char AArch64CollectLOH::ID = 0; -INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", - AARCH64_COLLECT_LOH_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", - AARCH64_COLLECT_LOH_NAME, false, false) - -/// Given a couple (MBB, reg) get the corresponding set of instruction from -/// the given "sets". -/// If this couple does not reference any set, an empty set is added to "sets" -/// for this couple and returned. -/// \param nbRegs is used internally allocate some memory. It must be consistent -/// with the way sets is used. -static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, - const MachineBasicBlock &MBB, unsigned reg, - unsigned nbRegs) { - SetOfMachineInstr *result; - BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); - if (it != sets.end()) - result = it->second.get(); - else - result = (sets[&MBB] = make_unique(nbRegs)).get(); - - return result[reg]; -} - -/// Given a couple (reg, MI) get the corresponding set of instructions from the -/// the given "sets". -/// This is used to get the uses record in sets of a definition identified by -/// MI and reg, i.e., MI defines reg. -/// If the couple does not reference anything, an empty set is added to -/// "sets[reg]". -/// \pre set[reg] is valid. -static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - return sets[reg][&MI]; -} - -/// Same as getUses but does not modify the input map: sets. -/// \return NULL if the couple (reg, MI) is not in sets. -static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - InstrToInstrs::const_iterator Res = sets[reg].find(&MI); - if (Res != sets[reg].end()) - return &(Res->second); - return nullptr; -} - -/// Initialize the reaching definition algorithm: -/// For each basic block BB in MF, record: -/// - its kill set. -/// - its reachable uses (uses that are exposed to BB's predecessors). -/// - its the generated definitions. 
-/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to -/// the list of uses of exposed defintions. -/// \param ADRPMode specifies to only consider ADRP instructions for generated -/// definition. It also consider definitions of ADRP instructions as uses and -/// ignore other uses. The ADRPMode is used to collect the information for LHO -/// that involve ADRP operation only. -static void initReachingDef(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - const MapRegToId &RegToId, - const MachineInstr *DummyOp, bool ADRPMode) { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - unsigned NbReg = RegToId.size(); - - for (const MachineBasicBlock &MBB : MF) { - auto &BBGen = Gen[&MBB]; - BBGen = make_unique(NbReg); - std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr); - - BitVector &BBKillSet = Kill[&MBB]; - BBKillSet.resize(NbReg); - for (const MachineInstr &MI : MBB) { - bool IsADRP = MI.getOpcode() == AArch64::ADRP; - - // Process uses first. - if (IsADRP || !ADRPMode) - for (const MachineOperand &MO : MI.operands()) { - // Treat ADRP def as use, as the goal of the analysis is to find - // ADRP defs reached by other ADRP defs. - if (!MO.isReg() || (!ADRPMode && !MO.isUse()) || - (ADRPMode && (!IsADRP || !MO.isDef()))) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - CurReg = ItCurRegId->second; - - // if CurReg has not been defined, this use is reachable. - if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) - getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI); - // current basic block definition for this color, if any, is in Gen. - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI); - } - - // Process clobbers. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - // Clobbers kill the related colors. - const uint32_t *PreservedRegs = MO.getRegMask(); - - // Set generated regs. - for (const auto &Entry : RegToId) { - unsigned Reg = Entry.second; - // Use the global register ID when querying APIs external to this - // pass. - if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { - // Do not register clobbered definition for no ADRP. - // This definition is not used anyway (otherwise register - // allocation is wrong). - BBGen[Reg] = ADRPMode ? &MI : nullptr; - BBKillSet.set(Reg); - } - } - } - - // Process register defs. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { - MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - // If this alias has not been recorded, then it is not interesting - // for the current analysis. - // We can end up in this situation because of tuple registers. - // E.g., Let say we are interested in S1. When we register - // S1, we will also register its aliases and in particular - // the tuple Q1_Q2. - // Now, when we encounter Q1_Q2, we will look through its aliases - // and will find that S2 is not registered. 
- if (ItRegId == RegToId.end()) - continue; - - BBKillSet.set(ItRegId->second); - BBGen[ItRegId->second] = &MI; - } - BBGen[ItCurRegId->second] = &MI; - } - } - - // If we restrict our analysis to basic block scope, conservatively add a - // dummy - // use for each generated value. - if (!ADRPMode && DummyOp && !MBB.succ_empty()) - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp); - } -} - -/// Reaching def core algorithm: -/// while an Out has changed -/// for each bb -/// for each color -/// In[bb][color] = U Out[bb.predecessors][color] -/// insert reachableUses[bb][color] in each in[bb][color] -/// op.reachedUses -/// -/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) -static void reachingDefAlgorithm(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - unsigned NbReg) { - bool HasChanged; - do { - HasChanged = false; - for (const MachineBasicBlock &MBB : MF) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); - SetOfMachineInstr &BBReachableUses = - getSet(ReachableUses, MBB, CurReg, NbReg); - SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); - unsigned Size = BBOutSet.size(); - // In[bb][color] = U Out[bb.predecessors][color] - for (const MachineBasicBlock *PredMBB : MBB.predecessors()) { - SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); - BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); - } - // insert reachableUses[bb][color] in each in[bb][color] op.reachedses - for (const MachineInstr *MI : BBInSet) { - SetOfMachineInstr &OpReachedUses = - getUses(ColorOpToReachedUses, CurReg, *MI); - OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); - } - // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) - if (!Kill[&MBB].test(CurReg)) - BBOutSet.insert(BBInSet.begin(), BBInSet.end()); - if (Gen[&MBB][CurReg]) - BBOutSet.insert(Gen[&MBB][CurReg]); - HasChanged |= BBOutSet.size() != Size; - } - } - } while (HasChanged); -} - -/// Reaching definition algorithm. -/// \param MF function on which the algorithm will operate. -/// \param[out] ColorOpToReachedUses will contain the result of the reaching -/// def algorithm. -/// \param ADRPMode specify whether the reaching def algorithm should be tuned -/// for ADRP optimization. \see initReachingDef for more details. -/// \param DummyOp if not NULL, the algorithm will work at -/// basic block scope and will set for every exposed definition a use to -/// @p DummyOp. -/// \pre ColorOpToReachedUses is an array of at least number of registers of -/// InstrToInstrs. -static void reachingDef(const MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, bool ADRPMode = false, - const MachineInstr *DummyOp = nullptr) { - // structures: - // For each basic block. - // Out: a set per color of definitions that reach the - // out boundary of this block. - // In: Same as Out but for in boundary. - // Gen: generated color in this block (one operation per color). - // Kill: register set of killed color in this block. - // ReachableUses: a set per color of uses (operation) reachable - // for "In" definitions. 
- BlockToSetOfInstrsPerColor Out, In, ReachableUses; - BlockToInstrPerColor Gen; - BlockToRegSet Kill; - - // Initialize Gen, kill and reachableUses. - initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, - DummyOp, ADRPMode); - - // Algo. - if (!DummyOp) - reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, - ReachableUses, RegToId.size()); -} +} // end anonymous namespace. -#ifndef NDEBUG -/// print the result of the reaching definition algorithm. -static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, - unsigned NbReg, const TargetRegisterInfo *TRI, - const MapIdToReg &IdToReg) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - if (ColorOpToReachedUses[CurReg].empty()) - continue; - DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); +INITIALIZE_PASS(AArch64CollectLOH, "aarch64-collect-loh", + AARCH64_COLLECT_LOH_NAME, false, false) - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - DEBUG(dbgs() << "Def:\n"); - DEBUG(DefsIt.first->print(dbgs())); - DEBUG(dbgs() << "Reachable uses:\n"); - for (const MachineInstr *MI : DefsIt.second) { - DEBUG(MI->print(dbgs())); - } - } +static bool canAddBePartOfLOH(const MachineInstr &MI) { + // Check immediate to see if the immediate is an address. + switch (MI.getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_BlockAddress: + return true; } } -#endif // NDEBUG /// Answer the following question: Can Def be one of the definition /// involved in a part of a LOH? -static bool canDefBePartOfLOH(const MachineInstr *Def) { - unsigned Opc = Def->getOpcode(); +static bool canDefBePartOfLOH(const MachineInstr &MI) { // Accept ADRP, ADDLow and LOADGot. - switch (Opc) { + switch (MI.getOpcode()) { default: return false; case AArch64::ADRP: return true; case AArch64::ADDXri: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_JumpTableIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_BlockAddress: - return true; - } + return canAddBePartOfLOH(MI); case AArch64::LDRXui: // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { + switch (MI.getOperand(2).getType()) { default: return false; case MachineOperand::MO_GlobalAddress: - return true; + return MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT; } } - // Unreachable. - return false; } /// Check whether the given instruction can the end of a LOH chain involving a /// store. -static bool isCandidateStore(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) { + switch (MI.getOpcode()) { default: return false; case AArch64::STRBBui: @@ -543,109 +211,19 @@ static bool isCandidateStore(const MachineInstr *Instr) { case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: + // We can only optimize the index operand. // In case we have str xA, [xA, #imm], this is two different uses // of xA and we cannot fold, otherwise the xA stored may be wrong, // even if #imm == 0. 
- if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg()) - return true; - } - return false; -} - -/// Given the result of a reaching definition algorithm in ColorOpToReachedUses, -/// Build the Use to Defs information and filter out obvious non-LOH candidates. -/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions. -/// In non-ADRPMode, non-LOH candidates are "uses" with several definition, -/// i.e., no simple chain. -/// \param ADRPMode -- \see initReachingDef. -static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, - const InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, - bool ADRPMode = false) { - - SetOfMachineInstr NotCandidate; - unsigned NbReg = RegToId.size(); - MapRegToId::const_iterator EndIt = RegToId.end(); - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) { - // If this color is never defined, continue. - if (ColorOpToReachedUses[CurReg].empty()) - continue; - - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - for (const MachineInstr *MI : DefsIt.second) { - const MachineInstr *Def = DefsIt.first; - MapRegToId::const_iterator It; - // if all the reaching defs are not adrp, this use will not be - // simplifiable. - if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) || - (!ADRPMode && !canDefBePartOfLOH(Def)) || - (!ADRPMode && isCandidateStore(MI) && - // store are LOH candidate iff the end of the chain is used as - // base. - ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt || - It->second != CurReg))) { - NotCandidate.insert(MI); - continue; - } - // Do not consider self reaching as a simplifiable case for ADRP. - if (!ADRPMode || MI != DefsIt.first) { - UseToReachingDefs[MI].insert(DefsIt.first); - // If UsesIt has several reaching definitions, it is not - // candidate for simplificaton in non-ADRPMode. - if (!ADRPMode && UseToReachingDefs[MI].size() > 1) - NotCandidate.insert(MI); - } - } - } - } - for (const MachineInstr *Elem : NotCandidate) { - DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n"); - // It would have been better if we could just remove the entry - // from the map. Because of that, we have to filter the garbage - // (second.empty) in the subsequence analysis. - UseToReachingDefs[Elem].clear(); - } -} - -/// Based on the use to defs information (in ADRPMode), compute the -/// opportunities of LOH ADRP-related. -static void computeADRP(const InstrToInstrs &UseToDefs, - AArch64FunctionInfo &AArch64FI, - const MachineDominatorTree *MDT) { - DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); - for (const auto &Entry : UseToDefs) { - unsigned Size = Entry.second.size(); - if (Size == 0) - continue; - if (Size == 1) { - const MachineInstr *L2 = *Entry.second.begin(); - const MachineInstr *L1 = Entry.first; - if (!MDT->dominates(L2, L1)) { - DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1 - << '\n'); - continue; - } - DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); - AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1}); - ++NumADRPSimpleCandidate; - } -#ifndef NDEBUG - else if (Size == 2) - ++NumADRPComplexCandidate2; - else if (Size == 3) - ++NumADRPComplexCandidate3; - else - ++NumADRPComplexCandidateOther; -#endif - // if Size < 1, the use should have been removed from the candidates - assert(Size >= 1 && "No reaching defs for that use!"); + return MI.getOperandNo(&MO) == 1 && + MI.getOperand(0).getReg() != MI.getOperand(1).getReg(); } } /// Check whether the given instruction can be the end of a LOH chain /// involving a load. 
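Before the load-candidate check that follows, a reading aid for the chain shapes the rewritten pass records. This is a standalone sketch with a local enum (the real values are the MCLOHType kinds used with addLOHDirective in the code further down, not this enum); it only summarizes how many instructions each directive ties together, which matches the two- and three-element argument lists passed to addLOHDirective below.

// Standalone sketch: LOH chain shapes and the number of instructions
// each directive covers (local illustration types only).
enum LOHKindSketch {
  AdrpAdrp,      // ADRP followed by a later ADRP of the same register
  AdrpAdd,       // ADRP + ADDXri materializing a symbol address
  AdrpLdrGot,    // ADRP + LDRXui loading a GOT entry
  AdrpLdr,       // ADRP + load from the symbol
  AdrpAddLdr,    // ADRP + ADDXri + load   (three-instruction chain)
  AdrpAddStr,    // ADRP + ADDXri + store  (three-instruction chain)
  AdrpLdrGotLdr, // ADRP + GOT load + load (three-instruction chain)
  AdrpLdrGotStr  // ADRP + GOT load + store (three-instruction chain)
};

// How many instructions a directive of each kind names.
static unsigned numInstructions(LOHKindSketch K) {
  switch (K) {
  case AdrpAdrp:
  case AdrpAdd:
  case AdrpLdrGot:
  case AdrpLdr:
    return 2;
  case AdrpAddLdr:
  case AdrpAddStr:
  case AdrpLdrGotLdr:
  case AdrpLdrGotStr:
    return 3;
  }
  return 0; // unreachable: all kinds handled above
}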
-static bool isCandidateLoad(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool isCandidateLoad(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDRSBWui: @@ -660,17 +238,13 @@ static bool isCandidateLoad(const MachineInstr *Instr) { case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: - if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT) - return false; - return true; + return !(MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT); } - // Unreachable. - return false; } /// Check whether the given instruction can load a litteral. -static bool supportLoadFromLiteral(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { +static bool supportLoadFromLiteral(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDRSWui: @@ -681,353 +255,233 @@ static bool supportLoadFromLiteral(const MachineInstr *Instr) { case AArch64::LDRQui: return true; } - // Unreachable. - return false; } -/// Check whether the given instruction is a LOH candidate. -/// \param UseToDefs is used to check that Instr is at the end of LOH supported -/// chain. -/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are -/// already been filtered out. -static bool isCandidate(const MachineInstr *Instr, - const InstrToInstrs &UseToDefs, - const MachineDominatorTree *MDT) { - if (!isCandidateLoad(Instr) && !isCandidateStore(Instr)) - return false; +/// Number of GPR registers traked by mapRegToGPRIndex() +static const unsigned N_GPR_REGS = 31; +/// Map register number to index from 0-30. +static int mapRegToGPRIndex(MCPhysReg Reg) { + static_assert(AArch64::X28 - AArch64::X0 + 3 == N_GPR_REGS, "Number of GPRs"); + static_assert(AArch64::W30 - AArch64::W0 + 1 == N_GPR_REGS, "Number of GPRs"); + if (AArch64::X0 <= Reg && Reg <= AArch64::X28) + return Reg - AArch64::X0; + if (AArch64::W0 <= Reg && Reg <= AArch64::W30) + return Reg - AArch64::W0; + // TableGen gives "FP" and "LR" an index not adjacent to X28 so we have to + // handle them as special cases. + if (Reg == AArch64::FP) + return 29; + if (Reg == AArch64::LR) + return 30; + return -1; +} - const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); - if (Def->getOpcode() != AArch64::ADRP) { - // At this point, Def is ADDXri or LDRXui of the right type of - // symbol, because we filtered out the uses that were not defined - // by these kind of instructions (+ ADRP). +/// State tracked per register. +/// The main algorithm walks backwards over a basic block maintaining this +/// datastructure for each tracked general purpose register. +struct LOHInfo { + MCLOHType Type : 8; ///< "Best" type of LOH possible. + bool IsCandidate : 1; ///< Possible LOH candidate. + bool OneUser : 1; ///< Found exactly one user (yet). + bool MultiUsers : 1; ///< Found multiple users. + const MachineInstr *MI0; ///< First instruction involved in the LOH. + const MachineInstr *MI1; ///< Second instruction involved in the LOH + /// (if any). + const MachineInstr *LastADRP; ///< Last ADRP in same register. +}; - // Check if this forms a simple chain: each intermediate node must - // dominates the next one. - if (!MDT->dominates(Def, Instr)) - return false; - // Move one node up in the simple chain. - if (UseToDefs.find(Def) == - UseToDefs.end() - // The map may contain garbage we have to ignore. 
- || - UseToDefs.find(Def)->second.empty()) - return false; - Instr = Def; - Def = *UseToDefs.find(Def)->second.begin(); +/// Update state \p Info given \p MI uses the tracked register. +static void handleUse(const MachineInstr &MI, const MachineOperand &MO, + LOHInfo &Info) { + // We have multiple uses if we already found one before. + if (Info.MultiUsers || Info.OneUser) { + Info.IsCandidate = false; + Info.MultiUsers = true; + return; } - // Check if we reached the top of the simple chain: - // - top is ADRP. - // - check the simple chain property: each intermediate node must - // dominates the next one. - if (Def->getOpcode() == AArch64::ADRP) - return MDT->dominates(Def, Instr); - return false; -} - -static bool registerADRCandidate(const MachineInstr &Use, - const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - AArch64FunctionInfo &AArch64FI, - SetOfMachineInstr *InvolvedInLOHs, - const MapRegToId &RegToId) { - // Look for opportunities to turn ADRP -> ADD or - // ADRP -> LDR GOTPAGEOFF into ADR. - // If ADRP has more than one use. Give up. - if (Use.getOpcode() != AArch64::ADDXri && - (Use.getOpcode() != AArch64::LDRXui || - !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT))) - return false; - InstrToInstrs::const_iterator It = UseToDefs.find(&Use); - // The map may contain garbage that we need to ignore. - if (It == UseToDefs.end() || It->second.empty()) - return false; - const MachineInstr &Def = **It->second.begin(); - if (Def.getOpcode() != AArch64::ADRP) - return false; - // Check the number of users of ADRP. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def.getOperand(0).getReg())->second, Def); - if (Users->size() > 1) { - ++NumADRComplexCandidate; - return false; + Info.OneUser = true; + + // Start new LOHInfo if applicable. + if (isCandidateLoad(MI)) { + Info.Type = MCLOH_AdrpLdr; + Info.IsCandidate = true; + Info.MI0 = &MI; + // Note that even this is AdrpLdr now, we can switch to a Ldr variant + // later. + } else if (isCandidateStore(MI, MO)) { + Info.Type = MCLOH_AdrpAddStr; + Info.IsCandidate = true; + Info.MI0 = &MI; + Info.MI1 = nullptr; + } else if (MI.getOpcode() == AArch64::ADDXri) { + Info.Type = MCLOH_AdrpAdd; + Info.IsCandidate = true; + Info.MI0 = &MI; + } else if (MI.getOpcode() == AArch64::LDRXui && + MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) { + Info.Type = MCLOH_AdrpLdrGot; + Info.IsCandidate = true; + Info.MI0 = &MI; } - ++NumADRSimpleCandidate; - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) && - "ADRP already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) && - "ADD already involved in LOH."); - DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); - - AArch64FI.addLOHDirective( - Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot, - {&Def, &Use}); - return true; } -/// Based on the use to defs information (in non-ADRPMode), compute the -/// opportunities of LOH non-ADRP-related -static void computeOthers(const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId, - const MachineDominatorTree *MDT) { - SetOfMachineInstr *InvolvedInLOHs = nullptr; -#ifndef NDEBUG - SetOfMachineInstr InvolvedInLOHsStorage; - InvolvedInLOHs = &InvolvedInLOHsStorage; -#endif // NDEBUG - DEBUG(dbgs() << "*** Compute LOH for Others\n"); - // ADRP -> ADD/LDR -> LDR/STR pattern. 
- // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. +/// Update state \p Info given the tracked register is clobbered. +static void handleClobber(LOHInfo &Info) { + Info.IsCandidate = false; + Info.OneUser = false; + Info.MultiUsers = false; + Info.LastADRP = nullptr; +} - // FIXME: When the statistics are not important, - // This initial filtering loop can be merged into the next loop. - // Currently, we didn't do it to have the same code for both DEBUG and - // NDEBUG builds. Indeed, the iterator of the second loop would need - // to be changed. - SetOfMachineInstr PotentialCandidates; - SetOfMachineInstr PotentialADROpportunities; - for (auto &Use : UseToDefs) { - // If no definition is available, this is a non candidate. - if (Use.second.empty()) - continue; - // Keep only instructions that are load or store and at the end of - // a ADRP -> ADD/LDR/Nothing chain. - // We already filtered out the no-chain cases. - if (!isCandidate(Use.first, UseToDefs, MDT)) { - PotentialADROpportunities.insert(Use.first); - continue; +/// Update state \p Info given that \p MI is possibly the middle instruction +/// of an LOH involving 3 instructions. +static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo, + LOHInfo &OpInfo) { + if (!DefInfo.IsCandidate || (&DefInfo != &OpInfo && OpInfo.OneUser)) + return false; + // Copy LOHInfo for dest register to LOHInfo for source register. + if (&DefInfo != &OpInfo) { + OpInfo = DefInfo; + // Invalidate \p DefInfo because we track it in \p OpInfo now. + handleClobber(DefInfo); + } else + DefInfo.LastADRP = nullptr; + + // Advance state machine. + assert(OpInfo.IsCandidate && "Expect valid state"); + if (MI.getOpcode() == AArch64::ADDXri && canAddBePartOfLOH(MI)) { + if (OpInfo.Type == MCLOH_AdrpLdr) { + OpInfo.Type = MCLOH_AdrpAddLdr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; + } else if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { + OpInfo.Type = MCLOH_AdrpAddStr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; } - PotentialCandidates.insert(Use.first); - } - - // Make the following distinctions for statistics as the linker does - // know how to decode instructions: - // - ADD/LDR/Nothing make there different patterns. - // - LDR/STR make two different patterns. - // Hence, 6 - 1 base patterns. - // (because ADRP-> Nothing -> STR is not simplifiable) - - // The linker is only able to have a simple semantic, i.e., if pattern A - // do B. - // However, we want to see the opportunity we may miss if we were able to - // catch more complex cases. - - // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> - // A potential candidate becomes a candidate, if its current immediate - // operand is zero and all nodes of the chain have respectively only one user -#ifndef NDEBUG - SetOfMachineInstr DefsOfPotentialCandidates; -#endif - for (const MachineInstr *Candidate : PotentialCandidates) { - // Get the definition of the candidate i.e., ADD or LDR. - const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); - // Record the elements of the chain. - const MachineInstr *L1 = Def; - const MachineInstr *L2 = nullptr; - unsigned ImmediateDefOpc = Def->getOpcode(); - if (Def->getOpcode() != AArch64::ADRP) { - // Check the number of users of this node. 
- const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifndef NDEBUG - // if all the uses of this def are in potential candidate, this is - // a complex candidate of level 2. - bool IsLevel2 = true; - for (const MachineInstr *MI : *Users) { - if (!PotentialCandidates.count(MI)) { - ++NumTooCplxLvl2; - IsLevel2 = false; - break; - } - } - if (IsLevel2) - ++NumCplxLvl2; -#endif // NDEBUG - PotentialADROpportunities.insert(Def); - continue; - } - L2 = Def; - Def = *UseToDefs.find(Def)->second.begin(); - L1 = Def; - } // else the element in the middle of the chain is nothing, thus - // Def already contains the first element of the chain. - - // Check the number of users of the first node in the chain, i.e., ADRP - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifndef NDEBUG - // if all the uses of this def are in the defs of the potential candidate, - // this is a complex candidate of level 1 - if (DefsOfPotentialCandidates.empty()) { - // lazy init - DefsOfPotentialCandidates = PotentialCandidates; - for (const MachineInstr *Candidate : PotentialCandidates) { - if (!UseToDefs.find(Candidate)->second.empty()) - DefsOfPotentialCandidates.insert( - *UseToDefs.find(Candidate)->second.begin()); - } - } - bool Found = false; - for (auto &Use : *Users) { - if (!DefsOfPotentialCandidates.count(Use)) { - ++NumTooCplxLvl1; - Found = true; - break; - } - } - if (!Found) - ++NumCplxLvl1; -#endif // NDEBUG - continue; + } else { + assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui"); + assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) && + "Expected GOT relocation"); + if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { + OpInfo.Type = MCLOH_AdrpLdrGotStr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; + } else if (OpInfo.Type == MCLOH_AdrpLdr) { + OpInfo.Type = MCLOH_AdrpLdrGotLdr; + OpInfo.IsCandidate = true; + OpInfo.MI1 = &MI; + return true; } + } + return false; +} - bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); - // If the chain is three instructions long and ldr is the second element, - // then this ldr must load form GOT, otherwise this is not a correct chain. - if (L2 && !IsL2Add && - !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)) - continue; - SmallVector Args; - MCLOHType Kind; - if (isCandidateLoad(Candidate)) { - if (!L2) { - // At this point, the candidate LOH indicates that the ldr instruction - // may use a direct access to the symbol. There is not such encoding - // for loads of byte and half. - if (!supportLoadFromLiteral(Candidate)) - continue; +/// Update state when seeing and ADRP instruction. +static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, + LOHInfo &Info) { + if (Info.LastADRP != nullptr) { + DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n" << '\t' << MI << '\t' + << *Info.LastADRP); + AFI.addLOHDirective(MCLOH_AdrpAdrp, {&MI, Info.LastADRP}); + ++NumADRPSimpleCandidate; + } - DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate - << '\n'); - Kind = MCLOH_AdrpLdr; - Args.push_back(L1); - Args.push_back(Candidate); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); + // Produce LOH directive if possible. 
+ if (Info.IsCandidate) { + switch (Info.Type) { + case MCLOH_AdrpAdd: + DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0}); + ++NumADRSimpleCandidate; + break; + case MCLOH_AdrpLdr: + if (supportLoadFromLiteral(*Info.MI0)) { + DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdr, {&MI, Info.MI0}); ++NumADRPToLDR; - } else { - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifndef NDEBUG - // get the immediate of the load - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToLDR; - else - ++NumLDRToLDR; - else if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToLDRWithImm; - else - ++NumLDRToLDRWithImm; -#endif // NDEBUG } - } else { - if (ImmediateDefOpc == AArch64::ADRP) - continue; - else { - - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifndef NDEBUG - // get the immediate of the store - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToSTR; - else - ++NumLDRToSTR; - else if (ImmediateDefOpc == AArch64::ADDXri) - ++NumADDToSTRWithImm; - else - ++NumLDRToSTRWithImm; -#endif // DEBUG + break; + case MCLOH_AdrpAddLdr: + DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0}); + ++NumADDToLDR; + break; + case MCLOH_AdrpAddStr: + if (Info.MI1 != nullptr) { + DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0}); + ++NumADDToSTR; } + break; + case MCLOH_AdrpLdrGotLdr: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGotLdr, {&MI, Info.MI1, Info.MI0}); + ++NumLDRToLDR; + break; + case MCLOH_AdrpLdrGotStr: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotStr:\n" << '\t' << MI << '\t' + << *Info.MI1 << '\t' << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGotStr, {&MI, Info.MI1, Info.MI0}); + ++NumLDRToSTR; + break; + case MCLOH_AdrpLdrGot: + DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGot:\n" << '\t' << MI << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpLdrGot, {&MI, Info.MI0}); + break; + case MCLOH_AdrpAdrp: + 
llvm_unreachable("MCLOH_AdrpAdrp not used in state machine"); } - AArch64FI.addLOHDirective(Kind, Args); } - // Now, we grabbed all the big patterns, check ADR opportunities. - for (const MachineInstr *Candidate : PotentialADROpportunities) - registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI, - InvolvedInLOHs, RegToId); + handleClobber(Info); + Info.LastADRP = &MI; } -/// Look for every register defined by potential LOHs candidates. -/// Map these registers with dense id in @p RegToId and vice-versa in -/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. -static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, - MapIdToReg &IdToReg, - const TargetRegisterInfo *TRI) { - unsigned CurRegId = 0; - if (!PreCollectRegister) { - unsigned NbReg = TRI->getNumRegs(); - for (; CurRegId < NbReg; ++CurRegId) { - RegToId[CurRegId] = CurRegId; - DEBUG(IdToReg.push_back(CurRegId)); - DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); - } +static void handleRegMaskClobber(const uint32_t *RegMask, MCPhysReg Reg, + LOHInfo *LOHInfos) { + if (!MachineOperand::clobbersPhysReg(RegMask, Reg)) return; - } - - DEBUG(dbgs() << "** Collect Involved Register\n"); - for (const auto &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI) && - !isCandidateLoad(&MI) && !isCandidateStore(&MI)) - continue; + int Idx = mapRegToGPRIndex(Reg); + if (Idx >= 0) + handleClobber(LOHInfos[Idx]); +} - // Process defs - for (MachineInstr::const_mop_iterator IO = MI.operands_begin(), - IOEnd = MI.operands_end(); - IO != IOEnd; ++IO) { - if (!IO->isReg() || !IO->isDef()) - continue; - unsigned CurReg = IO->getReg(); - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) - if (RegToId.find(*AI) == RegToId.end()) { - DEBUG(IdToReg.push_back(*AI); - assert(IdToReg[CurRegId] == *AI && - "Reg index mismatches insertion index.")); - RegToId[*AI] = CurRegId++; - DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); - } - } +static void handleNormalInst(const MachineInstr &MI, LOHInfo *LOHInfos) { + // Handle defs and regmasks. + for (const MachineOperand &MO : MI.operands()) { + if (MO.isRegMask()) { + const uint32_t *RegMask = MO.getRegMask(); + for (MCPhysReg Reg : AArch64::GPR32RegClass) + handleRegMaskClobber(RegMask, Reg, LOHInfos); + for (MCPhysReg Reg : AArch64::GPR64RegClass) + handleRegMaskClobber(RegMask, Reg, LOHInfos); + continue; } + if (!MO.isReg() || !MO.isDef()) + continue; + int Idx = mapRegToGPRIndex(MO.getReg()); + if (Idx < 0) + continue; + handleClobber(LOHInfos[Idx]); + } + // Handle uses. 
+ for (const MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + int Idx = mapRegToGPRIndex(MO.getReg()); + if (Idx < 0) + continue; + handleUse(MI, MO, LOHInfos[Idx]); } } @@ -1035,74 +489,59 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const MachineDominatorTree *MDT = &getAnalysis(); - - MapRegToId RegToId; - MapIdToReg IdToReg; - AArch64FunctionInfo *AArch64FI = MF.getInfo(); - assert(AArch64FI && "No MachineFunctionInfo for this function!"); - - DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); + DEBUG(dbgs() << "********** AArch64 Collect LOH **********\n" + << "Looking in function " << MF.getName() << '\n'); - collectInvolvedReg(MF, RegToId, IdToReg, TRI); - if (RegToId.empty()) - return false; + LOHInfo LOHInfos[N_GPR_REGS]; + AArch64FunctionInfo &AFI = *MF.getInfo(); + for (const MachineBasicBlock &MBB : MF) { + // Reset register tracking state. + memset(LOHInfos, 0, sizeof(LOHInfos)); + // Live-out registers are used. + for (const MachineBasicBlock *Succ : MBB.successors()) { + for (const auto &LI : Succ->liveins()) { + int RegIdx = mapRegToGPRIndex(LI.PhysReg); + if (RegIdx >= 0) + LOHInfos[RegIdx].OneUser = true; + } + } - MachineInstr *DummyOp = nullptr; - if (BasicBlockScopeOnly) { - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - // For local analysis, create a dummy operation to record uses that are not - // local. - DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); + // Walk the basic block backwards and update the per register state machine + // in the process. + for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AArch64::ADDXri: + case AArch64::LDRXui: + if (canDefBePartOfLOH(MI)) { + const MachineOperand &Def = MI.getOperand(0); + const MachineOperand &Op = MI.getOperand(1); + assert(Def.isReg() && Def.isDef() && "Expected reg def"); + assert(Op.isReg() && Op.isUse() && "Expected reg use"); + int DefIdx = mapRegToGPRIndex(Def.getReg()); + int OpIdx = mapRegToGPRIndex(Op.getReg()); + if (DefIdx >= 0 && OpIdx >= 0 && + handleMiddleInst(MI, LOHInfos[DefIdx], LOHInfos[OpIdx])) + continue; + } + break; + case AArch64::ADRP: + const MachineOperand &Op0 = MI.getOperand(0); + int Idx = mapRegToGPRIndex(Op0.getReg()); + if (Idx >= 0) { + handleADRP(MI, AFI, LOHInfos[Idx]); + continue; + } + break; + } + handleNormalInst(MI, LOHInfos); + } } - unsigned NbReg = RegToId.size(); - bool Modified = false; - - // Start with ADRP. - InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // Compute the reaching def in ADRP mode, meaning ADRP definitions - // are first considered as uses. - reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp); - DEBUG(dbgs() << "ADRP reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Translate the definition to uses map into a use to definitions map to ease - // statistic computation. - InstrToInstrs ADRPToReachingDefs; - reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); - - // Compute LOH for ADRP. - computeADRP(ADRPToReachingDefs, *AArch64FI, MDT); - delete[] ColorOpToReachedUses; - - // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. - ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // first perform a regular reaching def analysis. 
- reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp); - DEBUG(dbgs() << "All reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Turn that into a use to defs to ease statistic computation. - InstrToInstrs UsesToReachingDefs; - reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); - - // Compute other than AdrpAdrp LOH. - computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId, - MDT); - delete[] ColorOpToReachedUses; - - if (BasicBlockScopeOnly) - MF.DeleteMachineInstr(DummyOp); - - return Modified; + // Return "no change": The pass only collects information. + return false; } -/// createAArch64CollectLOHPass - returns an instance of the Statistic for -/// linker optimization pass. FunctionPass *llvm::createAArch64CollectLOHPass() { return new AArch64CollectLOH(); } diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4c98253878e..74a01835171 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11,28 +11,79 @@ // //===----------------------------------------------------------------------===// -#include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64ISelLowering.h" #include "AArch64PerfectShuffle.h" +#include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" -#include "AArch64TargetObjectFile.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + using namespace llvm; #define DEBUG_TYPE "aarch64-lower" @@ -59,7 +110,6 @@ static const MVT MVT_CC = MVT::i32; AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. setBooleanContents(ZeroOrOneBooleanContent); @@ -218,7 +268,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); @@ -3632,6 +3681,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, llvm_unreachable("Unexpected platform trying to use TLS"); } + SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); @@ -4549,7 +4599,6 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } - /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, @@ -5074,10 +5123,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int WindowBase; int WindowScale; - bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } ShuffleSourceInfo(SDValue Vec) - : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), - WindowScale(1) {} + : Vec(Vec), MinElt(std::numeric_limits::max()), MaxElt(0), + ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } }; // First gather all vectors used as an immediate source for this BUILD_VECTOR @@ -7028,7 +7078,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } case Intrinsic::aarch64_ldaxp: - case Intrinsic::aarch64_ldxp: { + case Intrinsic::aarch64_ldxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); @@ -7038,9 +7088,8 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = true; Info.writeMem = false; return true; - } case Intrinsic::aarch64_stlxp: - case Intrinsic::aarch64_stxp: { + case Intrinsic::aarch64_stxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); @@ -7050,7 +7099,6 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.readMem = false; Info.writeMem = true; return true; - } default: break; } @@ -8044,13 +8092,13 @@ static SDValue tryCombineToEXTR(SDNode *N, SDValue LHS; uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; + bool LHSFromHi = false; if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) return SDValue(); SDValue RHS; uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; + bool RHSFromHi = false; if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) return SDValue(); @@ -9732,52 +9780,51 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width, switch(CC) { case AArch64CC::LE: - case AArch64CC::GT: { + case AArch64CC::GT: if ((AddConstant == 0) || (CompConstant == MaxUInt - 1 && AddConstant < 0) || (AddConstant >= 0 && CompConstant < 0) || 
(AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) return true; - } break; + break; case AArch64CC::LT: - case AArch64CC::GE: { + case AArch64CC::GE: if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) return true; - } break; + break; case AArch64CC::HI: - case AArch64CC::LS: { + case AArch64CC::LS: if ((AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant >= -1 && CompConstant < AddConstant + MaxUInt)) return true; - } break; + break; case AArch64CC::PL: - case AArch64CC::MI: { + case AArch64CC::MI: if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) || (AddConstant < 0 && CompConstant <= AddConstant)) return true; - } break; + break; case AArch64CC::LO: - case AArch64CC::HS: { + case AArch64CC::HS: if ((AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant >= 0 && CompConstant <= AddConstant + MaxUInt)) return true; - } break; + break; case AArch64CC::EQ: - case AArch64CC::NE: { + case AArch64CC::NE: if ((AddConstant > 0 && CompConstant < 0) || (AddConstant < 0 && CompConstant >= 0 && CompConstant < AddConstant + MaxUInt) || (AddConstant >= 0 && CompConstant >= 0 && CompConstant >= AddConstant) || (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) - return true; - } break; + break; case AArch64CC::VS: case AArch64CC::VC: case AArch64CC::AL: @@ -10501,7 +10548,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, if (ValTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); + Function *Ldxr = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); @@ -10517,7 +10564,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldxr, Addr), @@ -10527,8 +10574,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall( - llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 626c934f236..5c8acba26aa 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -14,16 +14,37 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" -#include +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include +#include +#include +#include using namespace llvm; @@ -529,19 +550,19 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, default: llvm_unreachable("Unknown branch opcode in Cond"); case AArch64::CBZW: - Is64Bit = 0; + Is64Bit = false; CC = AArch64CC::EQ; break; case AArch64::CBZX: - Is64Bit = 1; + Is64Bit = true; CC = AArch64CC::EQ; break; case AArch64::CBNZW: - Is64Bit = 0; + Is64Bit = false; CC = AArch64CC::NE; break; case AArch64::CBNZX: - Is64Bit = 1; + Is64Bit = true; CC = AArch64CC::NE; break; } @@ -1044,7 +1065,7 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: - return Instr.getOpcode();; + return Instr.getOpcode(); case AArch64::ADDWrr: return AArch64::ADDSWrr; case AArch64::ADDWri: return AArch64::ADDSWri; @@ -1072,12 +1093,15 @@ static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { } namespace { + struct UsedNZCV { - bool N; - bool Z; - bool C; - bool V; - UsedNZCV(): N(false), Z(false), C(false), V(false) {} + bool N = false; + bool Z = false; + bool C = false; + bool V = false; + + UsedNZCV() = default; + UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { this->N |= UsedFlags.N; this->Z |= UsedFlags.Z; @@ -1086,6 +1110,7 @@ struct UsedNZCV { return *this; } }; + } // end anonymous 
namespace /// Find a condition code used by the instruction. @@ -1561,7 +1586,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const { /// Check all MachineMemOperands for a hint to suppress pairing. bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const { - return any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { return MMO->getFlags() & MOSuppressPair; }); } @@ -1994,7 +2019,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, void AArch64InstrInfo::copyPhysRegTuple( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, - llvm::ArrayRef Indices) const { + ArrayRef Indices) const { assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -2583,7 +2608,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // // - if (MI.isCopy()) { + if (MI.isFullCopy()) { unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && @@ -2598,7 +2623,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( } } - // Handle the case where a copy is being spilled or refilled but the source + // Handle the case where a copy is being spilled or filled but the source // and destination register class don't match. For example: // // %vreg0 = COPY %XZR; GPR64common:%vreg0 @@ -2613,7 +2638,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // %vreg0 = COPY %vreg1; GPR64:%vreg0, FPR64:%vreg1 // - // will be refilled as + // will be filled as // // LDRDui %vreg0, fi<#0> // @@ -2622,9 +2647,11 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // LDRXui %vregTemp, fi<#0> // %vreg0 = FMOV %vregTemp // - if (MI.isFullCopy() && Ops.size() == 1 && + if (MI.isCopy() && Ops.size() == 1 && // Make sure we're only folding the explicit COPY defs/uses. (Ops[0] == 0 || Ops[0] == 1)) { + bool IsSpill = Ops[0] == 0; + bool IsFill = !IsSpill; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock &MBB = *MI.getParent(); @@ -2632,21 +2659,112 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( const MachineOperand &SrcMO = MI.getOperand(1); unsigned DstReg = DstMO.getReg(); unsigned SrcReg = SrcMO.getReg(); + // This is slightly expensive to compute for physical regs since + // getMinimalPhysRegClass is slow. auto getRegClass = [&](unsigned Reg) { return TargetRegisterInfo::isVirtualRegister(Reg) ? 
MRI.getRegClass(Reg) : TRI.getMinimalPhysRegClass(Reg); }; - const TargetRegisterClass &DstRC = *getRegClass(DstReg); - const TargetRegisterClass &SrcRC = *getRegClass(SrcReg); - if (DstRC.getSize() == SrcRC.getSize()) { - if (Ops[0] == 0) + + if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { + assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() && + "Mismatched register size in non subreg COPY"); + if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, - &SrcRC, &TRI); + getRegClass(SrcReg), &TRI); else - loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, &DstRC, &TRI); + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, + getRegClass(DstReg), &TRI); return &*--InsertPt; } + + // Handle cases like spilling def of: + // + // %vreg0:sub_32 = COPY %WZR; GPR64common:%vreg0 + // + // where the physical register source can be widened and stored to the full + // virtual reg destination stack slot, in this case producing: + // + // STRXui %XZR, + // + if (IsSpill && DstMO.isUndef() && + TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + assert(SrcMO.getSubReg() == 0 && + "Unexpected subreg on physical register"); + const TargetRegisterClass *SpillRC; + unsigned SpillSubreg; + switch (DstMO.getSubReg()) { + default: + SpillRC = nullptr; + break; + case AArch64::sub_32: + case AArch64::ssub: + if (AArch64::GPR32RegClass.contains(SrcReg)) { + SpillRC = &AArch64::GPR64RegClass; + SpillSubreg = AArch64::sub_32; + } else if (AArch64::FPR32RegClass.contains(SrcReg)) { + SpillRC = &AArch64::FPR64RegClass; + SpillSubreg = AArch64::ssub; + } else + SpillRC = nullptr; + break; + case AArch64::dsub: + if (AArch64::FPR64RegClass.contains(SrcReg)) { + SpillRC = &AArch64::FPR128RegClass; + SpillSubreg = AArch64::dsub; + } else + SpillRC = nullptr; + break; + } + + if (SpillRC) + if (unsigned WidenedSrcReg = + TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { + storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), + FrameIndex, SpillRC, &TRI); + return &*--InsertPt; + } + } + + // Handle cases like filling use of: + // + // %vreg0:sub_32 = COPY %vreg1; GPR64:%vreg0, GPR32:%vreg1 + // + // where we can load the full virtual reg source stack slot, into the subreg + // destination, in this case producing: + // + // LDRWui %vreg0:sub_32, + // + if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { + const TargetRegisterClass *FillRC; + switch (DstMO.getSubReg()) { + default: + FillRC = nullptr; + break; + case AArch64::sub_32: + FillRC = &AArch64::GPR32RegClass; + break; + case AArch64::ssub: + FillRC = &AArch64::FPR32RegClass; + break; + case AArch64::dsub: + FillRC = &AArch64::FPR64RegClass; + break; + } + + if (FillRC) { + assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() && + "Mismatched regclass size on folded subreg COPY"); + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); + MachineInstr &LoadMI = *--InsertPt; + MachineOperand &LoadDst = LoadMI.getOperand(0); + assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); + LoadDst.setSubReg(DstMO.getSubReg()); + LoadDst.setIsUndef(); + return &LoadMI; + } + } } // Cannot fold. 
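Illustrative sketch (not part of the patch): the folding added above picks a widened spill register class from the COPY destination's subreg index and the class of the physical source register, and only emits the widened store when TRI.getMatchingSuperReg actually finds a super-register. A rough standalone approximation of that selection step, using simplified stand-in enums rather than the real AArch64 register classes, looks like this:

#include <cstdio>

// Simplified stand-ins for AArch64 subreg indices and register classes.
enum SubRegIdx { NoSubReg, Sub32, SSub, DSub };
enum RegClass { NoClass, GPR32, GPR64, FPR32, FPR64, FPR128 };

// Pick the widened register class used to spill a physical source register
// through a subreg-indexed COPY; returns NoClass when no widening applies.
static RegClass widenedSpillClass(SubRegIdx DstSubIdx, RegClass SrcRC) {
  switch (DstSubIdx) {
  case Sub32:
  case SSub:
    if (SrcRC == GPR32) return GPR64; // in the patch: %WZR is stored via its 64-bit super-reg (STRXui %XZR)
    if (SrcRC == FPR32) return FPR64;
    return NoClass;
  case DSub:
    return SrcRC == FPR64 ? FPR128 : NoClass;
  default:
    return NoClass;
  }
}

int main() {
  // %vreg0:sub_32 = COPY %WZR  ->  spill the 64-bit super-register instead.
  std::printf("%s\n", widenedSpillClass(Sub32, GPR32) == GPR64 ? "GPR64" : "none");
  return 0;
}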
@@ -2936,7 +3054,7 @@ bool AArch64InstrInfo::useMachineCombiner() const { return true; } -// + // True when Opc sets flag static bool isCombineInstrSettingFlag(unsigned Opc) { switch (Opc) { @@ -2955,7 +3073,7 @@ static bool isCombineInstrSettingFlag(unsigned Opc) { } return false; } -// + // 32b Opcodes that can be combined with a MUL static bool isCombineInstrCandidate32(unsigned Opc) { switch (Opc) { @@ -2974,7 +3092,7 @@ static bool isCombineInstrCandidate32(unsigned Opc) { } return false; } -// + // 64b Opcodes that can be combined with a MUL static bool isCombineInstrCandidate64(unsigned Opc) { switch (Opc) { @@ -2993,7 +3111,7 @@ static bool isCombineInstrCandidate64(unsigned Opc) { } return false; } -// + // FP Opcodes that can be combined with a FMUL static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { switch (Inst.getOpcode()) { @@ -3009,13 +3127,13 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { case AArch64::FSUBv2f32: case AArch64::FSUBv2f64: case AArch64::FSUBv4f32: - TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; - return (Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast); + TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; + return (Options.UnsafeFPMath || + Options.AllowFPOpFusion == FPOpFusion::Fast); } return false; } -// + // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); @@ -3205,7 +3323,7 @@ static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl &Patterns) { if (!isCombineInstrCandidateFP(Root)) - return 0; + return false; MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -3971,8 +4089,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); DelInstrs.push_back(&Root); - - return; } /// \brief Replace csincr-branch sequence by simple conditional branch @@ -4148,6 +4264,7 @@ AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { ArrayRef> AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { using namespace AArch64II; + static const std::pair TargetFlags[] = { {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, @@ -4162,6 +4279,7 @@ AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { ArrayRef> AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { using namespace AArch64II; + static const std::pair TargetFlags[] = { {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 90b2c089687..5037866925d 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -162,6 +162,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + // This tells target independent code that it is okay to pass instructions + // with subreg operands to foldMemoryOperandImpl. 
+ bool isSubregFoldable() const override { return true; } + using TargetInstrInfo::foldMemoryOperandImpl; MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 20de07424c5..b51473524c7 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -1071,8 +1071,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return false; } - const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( - (CmpInst::Predicate)I.getOperand(1).getPredicate()); + // CSINC increments the result by one when the condition code is false. + // Therefore, we have to invert the predicate to get an increment by 1 when + // the predicate is true. + const AArch64CC::CondCode invCC = + changeICMPPredToAArch64CC(CmpInst::getInversePredicate( + (CmpInst::Predicate)I.getOperand(1).getPredicate())); MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) .addDef(ZReg) @@ -1084,7 +1088,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { .addDef(I.getOperand(0).getReg()) .addUse(AArch64::WZR) .addUse(AArch64::WZR) - .addImm(CC); + .addImm(invCC); constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); diff --git a/lib/Target/AArch64/AArch64InstructionSelector.h b/lib/Target/AArch64/AArch64InstructionSelector.h index 0d44e696ac2..2c6e5a912fb 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.h +++ b/lib/Target/AArch64/AArch64InstructionSelector.h @@ -17,6 +17,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" namespace llvm { + class AArch64InstrInfo; class AArch64RegisterBankInfo; class AArch64RegisterInfo; @@ -29,7 +30,7 @@ class AArch64InstructionSelector : public InstructionSelector { const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); - virtual bool select(MachineInstr &I) const override; + bool select(MachineInstr &I) const override; private: /// tblgen-erated 'select' implementation, used as the initial selector for @@ -43,5 +44,6 @@ class AArch64InstructionSelector : public InstructionSelector { const AArch64RegisterBankInfo &RBI; }; -} // End llvm namespace. -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index ca2860afe13..f0bffe54415 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -14,17 +14,18 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/MC/MCLinkerOptimizationHint.h" +#include namespace llvm { /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { - /// Number of bytes of arguments this function has on the stack. If the callee /// is expected to restore the argument stack this should be a multiple of 16, /// all usable during a tail call. 
@@ -34,16 +35,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// space to a function with 16-bytes then misalignment of this value would /// make a stack adjustment necessary, which could not be undone by the /// callee. - unsigned BytesInStackArgArea; + unsigned BytesInStackArgArea = 0; /// The number of bytes to restore to deallocate space for incoming /// arguments. Canonically 0 in the C calling convention, but non-zero when /// callee is expected to pop the args. - unsigned ArgumentStackToRestore; + unsigned ArgumentStackToRestore = 0; /// HasStackFrame - True if this function has a stack frame. Set by /// determineCalleeSaves(). - bool HasStackFrame; + bool HasStackFrame = false; /// \brief Amount of stack frame size, not including callee-saved registers. unsigned LocalStackSize; @@ -53,54 +54,44 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// \brief Number of TLS accesses using the special (combinable) /// _TLS_MODULE_BASE_ symbol. - unsigned NumLocalDynamicTLSAccesses; + unsigned NumLocalDynamicTLSAccesses = 0; /// \brief FrameIndex for start of varargs area for arguments passed on the /// stack. - int VarArgsStackIndex; + int VarArgsStackIndex = 0; /// \brief FrameIndex for start of varargs area for arguments passed in /// general purpose registers. - int VarArgsGPRIndex; + int VarArgsGPRIndex = 0; /// \brief Size of the varargs area for arguments passed in general purpose /// registers. - unsigned VarArgsGPRSize; + unsigned VarArgsGPRSize = 0; /// \brief FrameIndex for start of varargs area for arguments passed in /// floating-point registers. - int VarArgsFPRIndex; + int VarArgsFPRIndex = 0; /// \brief Size of the varargs area for arguments passed in floating-point /// registers. - unsigned VarArgsFPRSize; + unsigned VarArgsFPRSize = 0; /// True if this function has a subset of CSRs that is handled explicitly via /// copies. - bool IsSplitCSR; + bool IsSplitCSR = false; /// True when the stack gets realigned dynamically because the size of stack /// frame is unknown at compile time. e.g., in case of VLAs. - bool StackRealigned; + bool StackRealigned = false; /// True when the callee-save stack area has unused gaps that may be used for /// other stack allocations. 
- bool CalleeSaveStackHasFreeSpace; + bool CalleeSaveStackHasFreeSpace = false; public: - AArch64FunctionInfo() - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false), StackRealigned(false), - CalleeSaveStackHasFreeSpace(false) {} - - explicit AArch64FunctionInfo(MachineFunction &MF) - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false), StackRealigned(false), - CalleeSaveStackHasFreeSpace(false) { + AArch64FunctionInfo() = default; + + explicit AArch64FunctionInfo(MachineFunction &MF) { (void)MF; } @@ -193,6 +184,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { MILOHContainer LOHContainerSet; SetOfInstructions LOHRelated; }; -} // End llvm namespace -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index f58bbbd2613..03e01329e03 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -71,6 +71,7 @@ void AArch64Subtarget::initializeProperties() { break; case Falkor: MaxInterleaveFactor = 4; + VectorInsertExtractBaseCost = 2; break; case Kryo: MaxInterleaveFactor = 4; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index e4ef0d4bb8d..d2883941e2c 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -15,24 +15,35 @@ #include "AArch64InstructionSelector.h" #include "AArch64LegalizerInfo.h" #include "AArch64RegisterBankInfo.h" +#include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/InitializePasses.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" +#include +#include + using namespace llvm; static cl::opt EnableCCMP("aarch64-enable-ccmp", @@ -154,9 +165,9 @@ extern "C" void LLVMInitializeAArch64Target() { //===----------------------------------------------------------------------===// static std::unique_ptr createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) - return make_unique(); + return llvm::make_unique(); - return make_unique(); + return llvm::make_unique(); } // Helper function to build a DataLayout 
string @@ -202,29 +213,35 @@ AArch64TargetMachine::AArch64TargetMachine( initAsmInfo(); } -AArch64TargetMachine::~AArch64TargetMachine() {} +AArch64TargetMachine::~AArch64TargetMachine() = default; #ifdef LLVM_BUILD_GLOBAL_ISEL namespace { + struct AArch64GISelActualAccessor : public GISelAccessor { std::unique_ptr CallLoweringInfo; std::unique_ptr InstSelector; std::unique_ptr Legalizer; std::unique_ptr RegBankInfo; + const CallLowering *getCallLowering() const override { return CallLoweringInfo.get(); } + const InstructionSelector *getInstructionSelector() const override { return InstSelector.get(); } + const LegalizerInfo *getLegalizerInfo() const override { return Legalizer.get(); } + const RegisterBankInfo *getRegBankInfo() const override { return RegBankInfo.get(); } }; -} // End anonymous namespace. + +} // end anonymous namespace #endif const AArch64Subtarget * @@ -287,6 +304,7 @@ AArch64beTargetMachine::AArch64beTargetMachine( : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} namespace { + /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: @@ -324,7 +342,8 @@ class AArch64PassConfig : public TargetPassConfig { void addPreSched2() override; void addPreEmitPass() override; }; -} // namespace + +} // end anonymous namespace TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { return TargetIRAnalysis([this](const Function &F) { @@ -414,14 +433,17 @@ bool AArch64PassConfig::addIRTranslator() { addPass(new IRTranslator()); return false; } + bool AArch64PassConfig::addLegalizeMachineIR() { addPass(new Legalizer()); return false; } + bool AArch64PassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; } + bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 88c98865bbc..1a17691fc58 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -417,14 +417,17 @@ int AArch64TTIImpl::getArithmeticInstrCost( } } -int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. 
unsigned NumVectorInstToHideOverhead = 10; + int MaxMergeDistance = 64; - if (Ty->isVectorTy() && IsComplex) + if (Ty->isVectorTy() && SE && + !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; // In many cases the address computation is not merged into the instruction diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 24642cb1698..849fd3d9b44 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -104,7 +104,7 @@ class AArch64TTIImpl : public BasicTTIImplBase { TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - int getAddressComputationCost(Type *Ty, bool IsComplex); + int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index db84afacf30..b86a283b40d 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -9,45 +9,62 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "MCTargetDesc/AArch64TargetStreamer.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCAsmParserExtension.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Debug.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include +#include +#include #include +#include +#include +#include +#include + using namespace llvm; namespace { -class AArch64Operand; - class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. // Map of register aliases registers via the .req directive. 
- StringMap > RegisterReqs; + StringMap> RegisterReqs; AArch64TargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); @@ -118,6 +135,7 @@ class AArch64AsmParser : public MCTargetAsmParser { #include "AArch64GenAsmMatcher.inc" }; bool IsILP32; + AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, STI) { @@ -143,9 +161,6 @@ class AArch64AsmParser : public MCTargetAsmParser { MCSymbolRefExpr::VariantKind &DarwinRefKind, int64_t &Addend); }; -} // end anonymous namespace - -namespace { /// AArch64Operand - Instances of this class represent a parsed AArch64 machine /// instruction. @@ -531,6 +546,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 2); } + bool isImm0_7() const { if (!isImm()) return false; @@ -540,6 +556,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 8); } + bool isImm1_8() const { if (!isImm()) return false; @@ -549,6 +566,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val > 0 && Val < 9); } + bool isImm0_15() const { if (!isImm()) return false; @@ -558,6 +576,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 16); } + bool isImm1_16() const { if (!isImm()) return false; @@ -567,6 +586,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val > 0 && Val < 17); } + bool isImm0_31() const { if (!isImm()) return false; @@ -576,6 +596,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 32); } + bool isImm1_31() const { if (!isImm()) return false; @@ -585,6 +606,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 32); } + bool isImm1_32() const { if (!isImm()) return false; @@ -594,6 +616,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 33); } + bool isImm0_63() const { if (!isImm()) return false; @@ -603,6 +626,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 64); } + bool isImm1_63() const { if (!isImm()) return false; @@ -612,6 +636,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 64); } + bool isImm1_64() const { if (!isImm()) return false; @@ -621,6 +646,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 1 && Val < 65); } + bool isImm0_127() const { if (!isImm()) return false; @@ -630,6 +656,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 128); } + bool isImm0_255() const { if (!isImm()) return false; @@ -639,6 +666,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 256); } + bool isImm0_65535() const { if (!isImm()) return false; @@ -648,6 +676,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 0 && Val < 65536); } + bool isImm32_63() const { if (!isImm()) return false; @@ -657,6 +686,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = MCE->getValue(); return (Val >= 32 && Val < 64); } + bool 
isLogicalImm32() const { if (!isImm()) return false; @@ -669,6 +699,7 @@ class AArch64Operand : public MCParsedAsmOperand { Val &= 0xFFFFFFFF; return AArch64_AM::isLogicalImmediate(Val, 32); } + bool isLogicalImm64() const { if (!isImm()) return false; @@ -677,6 +708,7 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); } + bool isLogicalImm32Not() const { if (!isImm()) return false; @@ -686,6 +718,7 @@ class AArch64Operand : public MCParsedAsmOperand { int64_t Val = ~MCE->getValue() & 0xFFFFFFFF; return AArch64_AM::isLogicalImmediate(Val, 32); } + bool isLogicalImm64Not() const { if (!isImm()) return false; @@ -694,7 +727,9 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64); } + bool isShiftedImm() const { return Kind == k_ShiftedImm; } + bool isAddSubImm() const { if (!isShiftedImm() && !isImm()) return false; @@ -737,6 +772,7 @@ class AArch64Operand : public MCParsedAsmOperand { // code deal with it. return true; } + bool isAddSubImmNeg() const { if (!isShiftedImm() && !isImm()) return false; @@ -756,7 +792,9 @@ class AArch64Operand : public MCParsedAsmOperand { const MCConstantExpr *CE = dyn_cast(Expr); return CE != nullptr && CE->getValue() < 0 && -CE->getValue() <= 0xfff; } + bool isCondCode() const { return Kind == k_CondCode; } + bool isSIMDImmType10() const { if (!isImm()) return false; @@ -765,6 +803,7 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); } + bool isBranchTarget26() const { if (!isImm()) return false; @@ -776,6 +815,7 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); } + bool isPCRelLabel19() const { if (!isImm()) return false; @@ -787,6 +827,7 @@ class AArch64Operand : public MCParsedAsmOperand { return false; return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); } + bool isBranchTarget14() const { if (!isImm()) return false; @@ -891,40 +932,49 @@ class AArch64Operand : public MCParsedAsmOperand { bool isFPImm() const { return Kind == k_FPImm; } bool isBarrier() const { return Kind == k_Barrier; } bool isSysReg() const { return Kind == k_SysReg; } + bool isMRSSystemRegister() const { if (!isSysReg()) return false; return SysReg.MRSReg != -1U; } + bool isMSRSystemRegister() const { if (!isSysReg()) return false; return SysReg.MSRReg != -1U; } + bool isSystemPStateFieldWithImm0_1() const { if (!isSysReg()) return false; return (SysReg.PStateField == AArch64PState::PAN || SysReg.PStateField == AArch64PState::UAO); } + bool isSystemPStateFieldWithImm0_15() const { if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false; return SysReg.PStateField != -1U; } + bool isReg() const override { return Kind == k_Register && !Reg.isVector; } bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isVectorRegLo() const { return Kind == k_Register && Reg.isVector && AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( Reg.RegNum); } + bool isGPR32as64() const { return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); } + bool isWSeqPair() const { return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( Reg.RegNum); } + bool isXSeqPair() const { return Kind == k_Register && !Reg.isVector && 
AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains( @@ -957,19 +1007,25 @@ class AArch64Operand : public MCParsedAsmOperand { bool isVectorIndex1() const { return Kind == k_VectorIndex && VectorIndex.Val == 1; } + bool isVectorIndexB() const { return Kind == k_VectorIndex && VectorIndex.Val < 16; } + bool isVectorIndexH() const { return Kind == k_VectorIndex && VectorIndex.Val < 8; } + bool isVectorIndexS() const { return Kind == k_VectorIndex && VectorIndex.Val < 4; } + bool isVectorIndexD() const { return Kind == k_VectorIndex && VectorIndex.Val < 2; } + bool isToken() const override { return Kind == k_Token; } + bool isTokenEqual(StringRef Str) const { return Kind == k_Token && getToken() == Str; } @@ -1006,6 +1062,7 @@ class AArch64Operand : public MCParsedAsmOperand { AArch64_AM::ShiftExtendType ET = getShiftExtendType(); return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX; } + bool isExtendLSL64() const { if (!isExtend()) return false; @@ -1836,11 +1893,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << ""; break; } - case k_PSBHint: { + case k_PSBHint: OS << getPSBHintName(); break; - } - case k_ShiftExtend: { + case k_ShiftExtend: OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); if (!hasShiftExtendAmount()) @@ -1848,7 +1904,6 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << '>'; break; } - } } /// @name Auto-generated Match Functions @@ -2469,7 +2524,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, Expr = MCConstantExpr::create(op2, getContext()); \ Operands.push_back( \ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - } while (0) + } while (false) if (Mnemonic == "ic") { if (!Op.compare_lower("ialluis")) { @@ -3979,7 +4034,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } } - switch (MatchResult) { case Match_Success: { // Perform range checking and other semantic validations @@ -4550,7 +4604,6 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, return Match_InvalidOperand; } - OperandMatchResultTy AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { @@ -4601,7 +4654,7 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { return MatchOperand_ParseFail; } - if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 || + if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 || (isXReg && !XRegClass.contains(SecondReg)) || (isWReg && !WRegClass.contains(SecondReg))) { Error(E,"expected second odd register of a " @@ -4610,7 +4663,7 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { } unsigned Pair = 0; - if(isXReg) { + if (isXReg) { Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube64, &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]); } else { diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 24e353cf4b9..bc2f7f18169 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -17,15 +17,12 @@ namespace llvm { -class MCInst; -class raw_ostream; - class AArch64Disassembler : public MCDisassembler { public: AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : MCDisassembler(STI, Ctx) {} - ~AArch64Disassembler() {} + ~AArch64Disassembler() override = default; MCDisassembler::DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, @@ -33,6 +30,6 @@ 
class AArch64Disassembler : public MCDisassembler { raw_ostream &CStream) const override; }; -} // namespace llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index a1edb3cef46..c954c0eb2c6 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -17,25 +17,30 @@ #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include +#include using namespace llvm; namespace { + class AArch64ELFObjectWriter : public MCELFObjectTargetWriter { public: AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, bool IsILP32); - ~AArch64ELFObjectWriter() override; + ~AArch64ELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; bool IsILP32; -private: }; -} + +} // end anonymous namespace AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, @@ -44,8 +49,6 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, /*HasRelocationAddend*/ true), IsILP32(IsILP32) {} -AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} - #define R_CLS(rtype) \ IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype #define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\ diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index f7058cdf237..62dfa59483e 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -15,15 +15,23 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include +#include + using namespace llvm; #define DEBUG_TYPE "mccodeemitter" @@ -37,13 +45,12 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { MCContext &Ctx; const MCInstrInfo &MCII; - AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT public: AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : Ctx(ctx), MCII(mcii) {} - - ~AArch64MCCodeEmitter() override {} + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) = delete; + void operator=(const AArch64MCCodeEmitter &) = delete; + ~AArch64MCCodeEmitter() override = default; // getBinaryCodeForInstr - TableGen'erated function for getting the // binary encoding for an instruction. 
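Illustrative sketch (not part of the patch): several hunks in this import, including the AArch64MCCodeEmitter change just above and the Lanai emitters further down, replace the old "declare but DO NOT IMPLEMENT" copy members with C++11 '= delete' and defaulted destructors. A minimal standalone example of the idiom change, with made-up class names:

// Pre-C++11 style: copying is suppressed by declaring, but never defining,
// the copy operations in a private section ("DO NOT IMPLEMENT").
class OldEmitter {
  OldEmitter(const OldEmitter &);      // DO NOT IMPLEMENT
  void operator=(const OldEmitter &);  // DO NOT IMPLEMENT
public:
  OldEmitter() {}
  ~OldEmitter() {}
};

// C++11 style used after this patch: deleted copy operations state the intent
// directly and turn misuse into a clear compile-time error, while '= default'
// documents that no custom teardown is needed.
class NewEmitter {
public:
  NewEmitter() = default;
  NewEmitter(const NewEmitter &) = delete;
  void operator=(const NewEmitter &) = delete;
  ~NewEmitter() = default;
};

int main() {
  NewEmitter E;
  // NewEmitter F = E;  // would not compile: the copy constructor is deleted
  (void)E;
  return 0;
}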
@@ -181,12 +188,6 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { } // end anonymous namespace -MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new AArch64MCCodeEmitter(MCII, Ctx); -} - /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. unsigned @@ -601,3 +602,9 @@ unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( #define ENABLE_INSTR_PREDICATE_VERIFIER #include "AArch64GenMCCodeEmitter.inc" + +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(MCII, Ctx); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 3e86a42d5be..1b949b54590 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -13,6 +13,7 @@ #include "AArch64TargetStreamer.h" #include "llvm/MC/ConstantPools.h" + using namespace llvm; // @@ -21,7 +22,7 @@ using namespace llvm; AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S) : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {} -AArch64TargetStreamer::~AArch64TargetStreamer() {} +AArch64TargetStreamer::~AArch64TargetStreamer() = default; // The constant pool handling is shared by all AArch64TargetStreamer // implementations. diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index a8e6902c252..4acd55eb612 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -176,12 +176,14 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SetupMachineFunction(MF); + const AMDGPUSubtarget &STM = MF.getSubtarget(); MCContext &Context = getObjFileLowering().getContext(); - MCSectionELF *ConfigSection = - Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); + if (!STM.isAmdHsaOS()) { + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + } - const AMDGPUSubtarget &STM = MF.getSubtarget(); SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { getSIProgramInfo(KernelInfo, MF); diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 85cbadf0a57..5f651d4da5d 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -269,7 +269,7 @@ unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) { unsigned encodeWaitcnt(IsaVersion Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) { - unsigned Waitcnt = getWaitcntBitMask(Version);; + unsigned Waitcnt = getWaitcntBitMask(Version); Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); Waitcnt = encodeExpcnt(Version, Waitcnt, Expcnt); Waitcnt = encodeLgkmcnt(Version, Waitcnt, Lgkmcnt); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 10e6297ef1e..cc001b59678 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -338,14 +338,17 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } 
-int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. unsigned NumVectorInstToHideOverhead = 10; + int MaxMergeDistance = 64; - if (Ty->isVectorTy() && IsComplex) + if (Ty->isVectorTy() && SE && + !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) return NumVectorInstToHideOverhead; // In many cases the address computation is not merged into the instruction diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index d83228afb0a..731a5adf3d7 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -104,7 +104,8 @@ class ARMTTIImpl : public BasicTTIImplBase { int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - int getAddressComputationCost(Type *Val, bool IsComplex); + int getAddressComputationCost(Type *Val, ScalarEvolution *SE, + const SCEV *Ptr); int getFPOpCost(Type *Ty); diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 903f92a0443..57ead973b56 100644 --- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -8,23 +8,41 @@ //===----------------------------------------------------------------------===// #include "Lanai.h" +#include "LanaiAluCode.h" +#include "LanaiCondCode.h" #include "MCTargetDesc/LanaiMCExpr.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/SMLoc.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" +#include +#include +#include +#include +#include namespace llvm { + +// Auto-generated by TableGen +static unsigned MatchRegisterName(StringRef Name); + namespace { + struct LanaiOperand; class LanaiAsmParser : public MCTargetAsmParser { @@ -80,9 +98,6 @@ class LanaiAsmParser : public MCTargetAsmParser { const MCSubtargetInfo &SubtargetInfo; }; -// Auto-generated by TableGen -static unsigned MatchRegisterName(llvm::StringRef Name); - // LanaiOperand - Instances of this class represented a parsed machine // instruction struct LanaiOperand : public MCParsedAsmOperand { @@ -627,6 +642,8 @@ struct LanaiOperand : public MCParsedAsmOperand { } }; +} // end anonymous namespace + bool LanaiAsmParser::ParseDirective(AsmToken /*DirectiveId*/) { return true; } bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode, @@ -680,11 +697,11 @@ std::unique_ptr LanaiAsmParser::parseRegister() { if (Lexer.getKind() == AsmToken::Identifier) { RegNum = MatchRegisterName(Lexer.getTok().getIdentifier()); if (RegNum == 0) - return 0; + 
return nullptr; Parser.Lex(); // Eat identifier token return LanaiOperand::createReg(RegNum, Start, End); } - return 0; + return nullptr; } bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc, @@ -701,15 +718,15 @@ bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc, std::unique_ptr LanaiAsmParser::parseIdentifier() { SMLoc Start = Parser.getTok().getLoc(); SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - const MCExpr *Res, *RHS = 0; + const MCExpr *Res, *RHS = nullptr; LanaiMCExpr::VariantKind Kind = LanaiMCExpr::VK_Lanai_None; if (Lexer.getKind() != AsmToken::Identifier) - return 0; + return nullptr; StringRef Identifier; if (Parser.parseIdentifier(Identifier)) - return 0; + return nullptr; // Check if identifier has a modifier if (Identifier.equals_lower("hi")) @@ -722,24 +739,24 @@ std::unique_ptr LanaiAsmParser::parseIdentifier() { if (Kind != LanaiMCExpr::VK_Lanai_None) { if (Lexer.getKind() != AsmToken::LParen) { Error(Lexer.getLoc(), "Expected '('"); - return 0; + return nullptr; } Lexer.Lex(); // lex '(' // Parse identifier if (Parser.parseIdentifier(Identifier)) - return 0; + return nullptr; } // If addition parse the RHS. if (Lexer.getKind() == AsmToken::Plus && Parser.parseExpression(RHS)) - return 0; + return nullptr; // For variants parse the final ')' if (Kind != LanaiMCExpr::VK_Lanai_None) { if (Lexer.getKind() != AsmToken::RParen) { Error(Lexer.getLoc(), "Expected ')'"); - return 0; + return nullptr; } Lexer.Lex(); // lex ')' } @@ -771,7 +788,7 @@ std::unique_ptr LanaiAsmParser::parseImmediate() { if (!Parser.parseExpression(ExprVal)) return LanaiOperand::createImm(ExprVal, Start, End); default: - return 0; + return nullptr; } } @@ -1204,10 +1221,9 @@ bool LanaiAsmParser::ParseInstruction(ParseInstructionInfo & /*Info*/, #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION #include "LanaiGenAsmMatcher.inc" -} // namespace extern "C" void LLVMInitializeLanaiAsmParser() { RegisterMCAsmParser x(getTheLanaiTarget()); } -} // namespace llvm +} // end namespace llvm diff --git a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h index a317cd88ad6..e0c19e8ea64 100644 --- a/lib/Target/Lanai/Disassembler/LanaiDisassembler.h +++ b/lib/Target/Lanai/Disassembler/LanaiDisassembler.h @@ -20,14 +20,11 @@ namespace llvm { -class MCInst; -class raw_ostream; - class LanaiDisassembler : public MCDisassembler { public: LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx); - ~LanaiDisassembler() override {} + ~LanaiDisassembler() override = default; // getInstruction - See MCDisassembler. 
MCDisassembler::DecodeStatus @@ -36,6 +33,6 @@ class LanaiDisassembler : public MCDisassembler { raw_ostream &CStream) const override; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H diff --git a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h index 1c9d186ad81..59904fbaa31 100644 --- a/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h +++ b/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h @@ -14,10 +14,10 @@ #ifndef LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H #define LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { -class MCOperand; class LanaiInstPrinter : public MCInstPrinter { public: @@ -28,14 +28,14 @@ class LanaiInstPrinter : public MCInstPrinter { void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, const MCSubtargetInfo &STI) override; void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMemRiOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printMemRrOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O); void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O); void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -60,6 +60,7 @@ class LanaiInstPrinter : public MCInstPrinter { bool printMemoryStoreIncrement(const MCInst *MI, raw_ostream &Ostream, StringRef Opcode, int AddOffset); }; -} // namespace llvm + +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index ae7870e07d4..d156294a0b0 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -11,31 +11,46 @@ // //===----------------------------------------------------------------------===// -#include "LanaiISelLowering.h" - #include "Lanai.h" +#include "LanaiCondCode.h" +#include "LanaiISelLowering.h" #include "LanaiMachineFunctionInfo.h" #include "LanaiSubtarget.h" -#include "LanaiTargetMachine.h" #include "LanaiTargetObjectFile.h" +#include "MCTargetDesc/LanaiBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include 
"llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetMachine.h" +#include +#include +#include +#include +#include #define DEBUG_TYPE "lanai-lower" @@ -195,6 +210,7 @@ SDValue LanaiTargetLowering::LowerOperation(SDValue Op, llvm_unreachable("unimplemented operand"); } } + //===----------------------------------------------------------------------===// // Lanai Inline Assembly Support //===----------------------------------------------------------------------===// @@ -244,7 +260,7 @@ LanaiTargetLowering::getSingleConstraintMatchWeight( Value *CallOperandVal = Info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (CallOperandVal == nullptr) return CW_Default; // Look at the constraint type. switch (*Constraint) { @@ -270,7 +286,7 @@ LanaiTargetLowering::getSingleConstraintMatchWeight( void LanaiTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result(nullptr, 0); // Only support length 1 constraints for now. if (Constraint.length() > 1) @@ -676,7 +692,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo( } else { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) + if (StackPtr.getNode() == nullptr) StackPtr = DAG.getCopyFromReg(Chain, DL, Lanai::SP, getPointerTy(DAG.getDataLayout())); @@ -1120,7 +1136,7 @@ const char *LanaiTargetLowering::getTargetNodeName(unsigned Opcode) const { case LanaiISD::SMALL: return "LanaiISD::SMALL"; default: - return NULL; + return nullptr; } } diff --git a/lib/Target/Lanai/LanaiRegisterInfo.h b/lib/Target/Lanai/LanaiRegisterInfo.h index 8b84bbc460e..c6e459076eb 100644 --- a/lib/Target/Lanai/LanaiRegisterInfo.h +++ b/lib/Target/Lanai/LanaiRegisterInfo.h @@ -21,9 +21,6 @@ namespace llvm { -class TargetInstrInfo; -class Type; - struct LanaiRegisterInfo : public LanaiGenRegisterInfo { LanaiRegisterInfo(); @@ -32,7 +29,7 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { // Code Generation virtual methods. 
const uint16_t * - getCalleeSavedRegs(const MachineFunction *MF = 0) const override; + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; @@ -42,7 +39,7 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; bool canRealignStack(const MachineFunction &MF) const override; @@ -58,6 +55,6 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo { int getDwarfRegNum(unsigned RegNum, bool IsEH) const; }; -} // namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp index e30d5e9a18e..e02bba529bd 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp @@ -9,20 +9,19 @@ #include "MCTargetDesc/LanaiBaseInfo.h" #include "MCTargetDesc/LanaiFixupKinds.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; namespace { + class LanaiELFObjectWriter : public MCELFObjectTargetWriter { public: explicit LanaiELFObjectWriter(uint8_t OSABI); - ~LanaiELFObjectWriter() override; + ~LanaiELFObjectWriter() override = default; protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, @@ -30,14 +29,13 @@ class LanaiELFObjectWriter : public MCELFObjectTargetWriter { bool needsRelocateWithSymbol(const MCSymbol &SD, unsigned Type) const override; }; -} // namespace + +} // end anonymous namespace LanaiELFObjectWriter::LanaiELFObjectWriter(uint8_t OSABI) : MCELFObjectTargetWriter(/*Is64Bit_=*/false, OSABI, ELF::EM_LANAI, /*HasRelocationAddend=*/true) {} -LanaiELFObjectWriter::~LanaiELFObjectWriter() {} - unsigned LanaiELFObjectWriter::getRelocType(MCContext & /*Ctx*/, const MCValue & /*Target*/, const MCFixup &Fixup, diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp index ce68b7e24db..f5b5335bb98 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp @@ -12,37 +12,38 @@ //===----------------------------------------------------------------------===// #include "Lanai.h" +#include "LanaiAluCode.h" #include "MCTargetDesc/LanaiBaseInfo.h" #include "MCTargetDesc/LanaiFixupKinds.h" #include "MCTargetDesc/LanaiMCExpr.h" -#include "MCTargetDesc/LanaiMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include +#include #define DEBUG_TYPE "mccodeemitter" STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace llvm { + namespace { -class LanaiMCCodeEmitter : public MCCodeEmitter { - LanaiMCCodeEmitter(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT 
- const MCInstrInfo &InstrInfo; - MCContext &Context; +class LanaiMCCodeEmitter : public MCCodeEmitter { public: - LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C) - : InstrInfo(MCII), Context(C) {} - - ~LanaiMCCodeEmitter() override {} + LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C) {} + LanaiMCCodeEmitter(const LanaiMCCodeEmitter &) = delete; + void operator=(const LanaiMCCodeEmitter &) = delete; + ~LanaiMCCodeEmitter() override = default; // The functions below are called by TableGen generated functions for getting // the binary encoding of instructions/opereands. @@ -86,6 +87,8 @@ class LanaiMCCodeEmitter : public MCCodeEmitter { const MCSubtargetInfo &STI) const; }; +} // end anonymous namespace + Lanai::Fixups FixupKind(const MCExpr *Expr) { if (isa(Expr)) return Lanai::FIXUP_LANAI_21; @@ -298,8 +301,8 @@ unsigned LanaiMCCodeEmitter::getBranchTargetOpValue( } #include "LanaiGenMCCodeEmitter.inc" -} // namespace -} // namespace llvm + +} // end namespace llvm llvm::MCCodeEmitter * llvm::createLanaiMCCodeEmitter(const MCInstrInfo &InstrInfo, diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp index c2f8c0f7ad5..a47ff9ff3d6 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp @@ -11,16 +11,21 @@ // //===----------------------------------------------------------------------===// +#include "LanaiMCAsmInfo.h" #include "LanaiMCTargetDesc.h" - #include "InstPrinter/LanaiInstPrinter.h" -#include "LanaiMCAsmInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include +#include #define GET_INSTRINFO_MC_DESC #include "LanaiGenInstrInfo.inc" @@ -70,7 +75,7 @@ static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/, const MCRegisterInfo &MRI) { if (SyntaxVariant == 0) return new LanaiInstPrinter(MAI, MII, MRI); - return 0; + return nullptr; } static MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple, @@ -79,6 +84,7 @@ static MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple, } namespace { + class LanaiMCInstrAnalysis : public MCInstrAnalysis { public: explicit LanaiMCInstrAnalysis(const MCInstrInfo *Info) @@ -107,6 +113,7 @@ class LanaiMCInstrAnalysis : public MCInstrAnalysis { } } }; + } // end anonymous namespace static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) { @@ -131,7 +138,7 @@ extern "C" void LLVMInitializeLanaiTargetMC() { // Register the MC code emitter TargetRegistry::RegisterMCCodeEmitter(getTheLanaiTarget(), - llvm::createLanaiMCCodeEmitter); + createLanaiMCCodeEmitter); // Register the ASM Backend TargetRegistry::RegisterMCAsmBackend(getTheLanaiTarget(), diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index d3c88482f09..05acd25ae5f 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -47,7 +47,7 @@ namespace llvm { FCTIDZ, FCTIWZ, /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for - /// unsigned integers. + /// unsigned integers with round toward zero. 
FCTIDUZ, FCTIWUZ, /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 03b2257a88a..fbec8787ef8 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1154,6 +1154,9 @@ defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB), defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB), "fctid", "$frD, $frB", IIC_FPGeneral, []>, isPPC64; +defm FCTIDU : XForm_26r<63, 942, (outs f8rc:$frD), (ins f8rc:$frB), + "fctidu", "$frD, $frB", IIC_FPGeneral, + []>, isPPC64; defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB), "fctidz", "$frD, $frB", IIC_FPGeneral, [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64; diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index 99689f656c2..ef7d2012a23 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -603,6 +603,12 @@ class XForm_17 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = 0; } +class XForm_17a opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : XForm_17 { + let FRA = 0; +} + // Used for QPX class XForm_18 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index a7231bd2e2c..90111bbea07 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -2172,11 +2172,19 @@ let isCompare = 1, hasSideEffects = 0 in { "fcmpu $crD, $fA, $fB", IIC_FPCompare>; } +def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), + "ftdiv $crD, $fA, $fB", IIC_FPCompare>; +def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB), + "ftsqrt $crD, $fB", IIC_FPCompare>; + let Uses = [RM] in { let hasSideEffects = 0 in { defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), "fctiw", "$frD, $frB", IIC_FPGeneral, []>; + defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwu", "$frD, $frB", IIC_FPGeneral, + []>; defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), "fctiwz", "$frD, $frB", IIC_FPGeneral, [(set f64:$frD, (PPCfctiwz f64:$frB))]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index fd218939727..7f72ab17f61 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16985,10 +16985,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } - if (Cond.getOpcode() == ISD::SETCC) { + if (Cond.getOpcode() == ISD::SETCC) if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; - } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y @@ -18289,6 +18288,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, /// constant. Takes immediate version of shift as input. 
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); @@ -18306,27 +18306,32 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } - const X86Subtarget &Subtarget = - static_cast(DAG.getSubtarget()); - if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && - ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { - // Let the shuffle legalizer expand this shift amount node. + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + // +=================+============+=======================================+ + // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | + // +=================+============+=======================================+ + // | i64 | Yes, No | Use ShAmt as lowest elt | + // | i32 | Yes | zero-extend in-reg | + // | (i32 zext(i16)) | Yes | zero-extend in-reg | + // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | + // +=================+============+=======================================+ + + if (SVT == MVT::i64) + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); + else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { SDValue Op0 = ShAmt.getOperand(0); Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG); + ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64); + } else if (Subtarget.hasSSE41() && + ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { - // Need to build a vector containing shift amount. - // SSE/AVX packed shifts only use the lower 64-bit of the shift count. - SmallVector ShOps; - ShOps.push_back(ShAmt); - if (SVT == MVT::i32) { - ShOps.push_back(DAG.getConstant(0, dl, SVT)); - ShOps.push_back(DAG.getUNDEF(SVT)); - } - ShOps.push_back(DAG.getUNDEF(SVT)); - - MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; - ShAmt = DAG.getBuildVector(BVT, dl, ShOps); + SmallVector ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), + DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; + ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } // The return type has to be a 128-bit type with the same element @@ -19014,7 +19019,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); + Op.getOperand(1), Op.getOperand(2), Subtarget, + DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); @@ -21276,7 +21282,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); - return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); } } @@ -25951,12 +25957,11 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, // instructions. 
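The getTargetVShiftNode rewrite above leans on the fact, stated in its new comment, that SSE/AVX packed shifts read only the low 64 bits of the count operand, which is why placing the scalar amount in the lowest element and zeroing element 1 is enough. A minimal plain-C++ model of that semantic for a logical 32-bit left shift (an illustration only, not LLVM code; the count-ge-32 behaviour follows the PSLLD definition):

    #include <array>
    #include <cstdint>

    // Model of a packed 32-bit logical left shift with a vector count
    // (PSLLD xmm, xmm): every lane is shifted by the same amount, taken
    // from the low 64 bits of the count register; counts >= 32 zero the lane.
    std::array<uint32_t, 4> packedShl32(std::array<uint32_t, 4> X,
                                        std::array<uint32_t, 4> CountVec) {
      uint64_t Count = (uint64_t)CountVec[0] | ((uint64_t)CountVec[1] << 32);
      std::array<uint32_t, 4> R{};
      for (int I = 0; I != 4; ++I)
        R[I] = Count >= 32 ? 0 : X[I] << Count;
      return R;
    }

With the build_vector form {ShAmt, 0, undef, undef} used in the patch, the low 64 bits are exactly the zero-extended scalar amount, so the undef upper elements never influence the result.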
// TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, + bool FloatDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - bool FloatDomain = MaskVT.isFloatingPoint() || - (!Subtarget.hasAVX2() && MaskVT.is256BitVector()); // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && @@ -26067,11 +26072,11 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, + bool FloatDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); - bool FloatDomain = MaskVT.isFloatingPoint(); bool ContainsZeros = false; SmallBitVector Zeroable(NumMaskElts, false); @@ -26211,11 +26216,10 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, - SDValue &V1, SDValue &V2, + bool FloatDomain, SDValue &V1, SDValue &V2, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, bool IsUnary) { - bool FloatDomain = MaskVT.isFloatingPoint(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { @@ -26310,13 +26314,13 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, + bool FloatDomain, SDValue &V1, SDValue &V2, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); - bool FloatDomain = MaskVT.isFloatingPoint(); // Attempt to match against PALIGNR byte rotate. if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || @@ -26594,8 +26598,8 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } } - if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT, - ShuffleVT)) { + if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle, + ShuffleSrcVT, ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26609,8 +26613,8 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return true; } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, - ShuffleVT, PermuteImm)) { + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, + Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26626,8 +26630,8 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } } - if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle, - ShuffleVT, UnaryShuffle)) { + if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget, + Shuffle, ShuffleVT, UnaryShuffle)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! 
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26643,8 +26647,9 @@ static bool combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return true; } - if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget, - Shuffle, ShuffleVT, PermuteImm)) { + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL, + DAG, Subtarget, Shuffle, ShuffleVT, + PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -28742,6 +28747,27 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DAG.getConstant(Imm, DL, MVT::i8))); return true; } + case ISD::EXTRACT_SUBVECTOR: { + unsigned EltSize = EltVT.getSizeInBits(); + if (EltSize != 32 && EltSize != 64) + return false; + MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); + // Only change element size, not type. + if (VT.isInteger() != OpEltVT.isInteger()) + return false; + uint64_t Imm = cast(Op.getOperand(1))->getZExtValue(); + Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; + // Op0 needs to be bitcasted to a larger vector with the same element type. + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = MVT::getVectorVT(EltVT, + Op0.getSimpleValueType().getSizeInBits() / EltSize); + Op0 = DAG.getBitcast(Op0VT, Op0); + DCI.AddToWorklist(Op0.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0, + DAG.getConstant(Imm, DL, MVT::i8))); + return true; + } } return false; @@ -30921,6 +30947,59 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } +/// Check if truncation with saturation form type \p SrcVT to \p DstVT +/// is valid for the given \p Subtarget. +static bool +isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX512()) + return false; + EVT SrcElVT = SrcVT.getScalarType(); + EVT DstElVT = DstVT.getScalarType(); + if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) + return false; + if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32) + return false; + if (SrcVT.is512BitVector() || Subtarget.hasVLX()) + return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); + return false; +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched or the unsupported on the current target. +static SDValue +detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) { + if (In.getOpcode() != ISD::UMIN) + return SDValue(); + + EVT InVT = In.getValueType(); + // FIXME: Scalar type may be supported if we move it to vector register. + if (!InVT.isVector() || !InVT.isSimple()) + return SDValue(); + + if (!isSATValidOnSubtarget(InVT, VT, Subtarget)) + return SDValue(); + + //Saturation with truncation. We truncate from InVT to VT. + assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && + "Unexpected types for truncate operation"); + + SDValue SrcVal; + APInt C; + if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C)) + SrcVal = In.getOperand(1); + else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) + SrcVal = In.getOperand(0); + else + return SDValue(); + + // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according + // the element size of the destination type. 
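As a quick numeric check of the comment above (a standalone sketch, not part of the patch): the splat bound that makes the umin-then-truncate pattern a saturating truncate must be exactly 2^width - 1 for the destination element width.

    #include <cstdint>

    // Largest value representable in an unsigned destination element type;
    // this is the constant the detectUSatPattern check compares against.
    constexpr uint64_t usatBound(unsigned DstBits) {
      return (uint64_t(1) << DstBits) - 1;
    }

    static_assert(usatBound(8)  == 0xFF,        "UINT8_MAX");
    static_assert(usatBound(16) == 0xFFFF,      "UINT16_MAX");
    static_assert(usatBound(32) == 0xFFFFFFFF,  "UINT32_MAX");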
+ return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ? + SrcVal : SDValue(); +} + /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. @@ -31487,6 +31566,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); + if (SDValue Val = + detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); @@ -31967,7 +32052,8 @@ combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. static SDValue -combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, +combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget, + SelectionDAG &DAG, SmallVector &Regs) { assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); EVT OutVT = N->getValueType(0); @@ -31976,8 +32062,10 @@ combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, // Shift left by 16 bits, then arithmetic-shift right by 16 bits. SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); for (auto &Reg : Regs) { - Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG); - Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG); + Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, + Subtarget, DAG); + Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, + Subtarget, DAG); } for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) @@ -32046,7 +32134,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DAG, SubVec); else if (InSVT == MVT::i32) - return combineVectorTruncationWithPACKSS(N, DAG, SubVec); + return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec); else return SDValue(); } @@ -32104,6 +32192,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try the truncation with unsigned saturation. + if (SDValue Val = detectUSatPattern(Src, VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val); + // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index d7792e296a5..de4839432b9 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -80,9 +80,12 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (Vector) { - if (ST->hasAVX512()) return 512; - if (ST->hasAVX()) return 256; - if (ST->hasSSE1()) return 128; + if (ST->hasAVX512()) + return 512; + if (ST->hasAVX()) + return 256; + if (ST->hasSSE1()) + return 128; return 0; } @@ -211,11 +214,9 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX512DQ lowering tricks for custom cases. 
- if (ST->hasDQI()) { - if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, - LT.second)) + if (ST->hasDQI()) + if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX512BWCostTable[] = { { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. @@ -225,37 +226,38 @@ int X86TTIImpl::getArithmeticInstrCost( // Vectorizing division is a bad idea. See the SSE2 table for more comments. { ISD::SDIV, MVT::v64i8, 64*20 }, { ISD::SDIV, MVT::v32i16, 32*20 }, - { ISD::SDIV, MVT::v16i32, 16*20 }, - { ISD::SDIV, MVT::v8i64, 8*20 }, { ISD::UDIV, MVT::v64i8, 64*20 }, - { ISD::UDIV, MVT::v32i16, 32*20 }, - { ISD::UDIV, MVT::v16i32, 16*20 }, - { ISD::UDIV, MVT::v8i64, 8*20 }, + { ISD::UDIV, MVT::v32i16, 32*20 } }; // Look for AVX512BW lowering tricks for custom cases. - if (ST->hasBWI()) { - if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, - LT.second)) + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX512CostTable[] = { - { ISD::SHL, MVT::v16i32, 1 }, - { ISD::SRL, MVT::v16i32, 1 }, - { ISD::SRA, MVT::v16i32, 1 }, - { ISD::SHL, MVT::v8i64, 1 }, - { ISD::SRL, MVT::v8i64, 1 }, - { ISD::SRA, MVT::v8i64, 1 }, - - { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. + { ISD::SHL, MVT::v16i32, 1 }, + { ISD::SRL, MVT::v16i32, 1 }, + { ISD::SRA, MVT::v16i32, 1 }, + { ISD::SHL, MVT::v8i64, 1 }, + { ISD::SRL, MVT::v8i64, 1 }, + { ISD::SRA, MVT::v8i64, 1 }, + + { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v16i32, 1 }, // pmulld + { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v16i32, 16*20 }, + { ISD::SDIV, MVT::v8i64, 8*20 }, + { ISD::UDIV, MVT::v16i32, 16*20 }, + { ISD::UDIV, MVT::v8i64, 8*20 } }; - if (ST->hasAVX512()) { + if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to @@ -315,10 +317,9 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for XOP lowering tricks. - if (ST->hasXOP()) { + if (ST->hasXOP()) if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX2CustomCostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. @@ -334,6 +335,8 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i32, 1 }, // pmulld + { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ @@ -344,11 +347,10 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX2()) { + if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVXCustomCostTable[] = { { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence. 
@@ -372,24 +374,10 @@ int X86TTIImpl::getArithmeticInstrCost( }; // Look for AVX2 lowering tricks for custom cases. - if (ST->hasAVX()) { + if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - - static const CostTblEntry SSE42FloatCostTable[] = { - { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ - }; - - if (ST->hasSSE42()) { - if (const auto *Entry = CostTableLookup(SSE42FloatCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; - } static const CostTblEntry SSE2UniformCostTable[] = { @@ -452,6 +440,17 @@ int X86TTIImpl::getArithmeticInstrCost( ISD = ISD::MUL; } + static const CostTblEntry SSE42CostTable[] = { + { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ + }; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry SSE41CostTable[] = { { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. @@ -471,44 +470,39 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence. { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend. + + { ISD::MUL, MVT::v4i32, 1 } // pmulld }; - if (ST->hasSSE41()) { + if (ST->hasSSE41()) if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. - { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. 
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle + { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ @@ -531,10 +525,9 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::UDIV, MVT::v2i64, 2*20 }, }; - if (ST->hasSSE2()) { + if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized @@ -553,307 +546,278 @@ int X86TTIImpl::getArithmeticInstrCost( // A v4i64 multiply is custom lowered as two split v2i64 vectors that then // are lowered as a series of long multiplies(3), shifts(3) and adds(2) // Because we believe v4i64 to be a legal type, we must also include the - // split factor of two in the cost table. Therefore, the cost here is 16 + // extract+insert in the cost table. Therefore, the cost here is 18 // instead of 8. - { ISD::MUL, MVT::v4i64, 16 }, + { ISD::MUL, MVT::v4i64, 18 }, }; // Look for AVX1 lowering tricks. - if (ST->hasAVX() && !ST->hasAVX2()) { - MVT VT = LT.second; - - if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + if (ST->hasAVX() && !ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) return LT.first * Entry->Cost; - } - // Custom lowering of vectors. - static const CostTblEntry CustomLowered[] = { - // A v2i64/v4i64 and multiply is custom lowered as a series of long - // multiplies(3), shifts(3) and adds(2). - { ISD::MUL, MVT::v2i64, 8 }, - { ISD::MUL, MVT::v4i64, 8 }, - { ISD::MUL, MVT::v8i64, 8 } - }; - if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) - return LT.first * Entry->Cost; - - // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, - // 2x pmuludq, 2x shuffle. - if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() && - !ST->hasSSE41()) - return LT.first * 6; - - static const CostTblEntry SSE1FloatCostTable[] = { + static const CostTblEntry SSE1CostTable[] = { { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ }; if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1FloatCostTable, ISD, - LT.second)) + if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) return LT.first * Entry->Cost; + // Fallback to the default implementation. return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) { - // 64-bit packed float vectors (v2f32) are widened to type v4f32. - // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb - { TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb - }; - - if (ST->hasVBMI()) - if (const auto *Entry = - CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + // 64-bit packed float vectors (v2f32) are widened to type v4f32. + // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. 
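To make the recurring `LT.first * Entry->Cost` pattern in the cost hunks above concrete, here is a worked instance using the v8i64 multiply entry added to AVX512CostTable; the assumption that an IR-level v16i64 multiply legalizes into two v8i64 operations is an illustrative choice, not something the patch states.

    // Per-legal-vector cost from the AVX512CostTable entry above:
    //   { ISD::MUL, MVT::v8i64, 8 }   // 3*pmuludq + 3*shift + 2*add = 8
    constexpr int PerVectorMulCost = 3 + 3 + 2;                        // = 8
    constexpr int NumLegalVectors  = 2;   // assumed: v16i64 -> 2 x v8i64
    constexpr int TotalCost = NumLegalVectors * PerVectorMulCost;      // = 16
    static_assert(TotalCost == 16, "LT.first * Entry->Cost");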
+ std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + + // For Broadcasts we are splatting the first element from the first input + // register, so only need to reference that input and all the output + // registers are the same. + if (Kind == TTI::SK_Broadcast) + LT.first = 1; + + // We are going to permute multiple sources and the result will be in multiple + // destinations. Providing an accurate cost only for splits where the element + // type remains the same. + if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { + MVT LegalVT = LT.second; + if (LegalVT.getVectorElementType().getSizeInBits() == + Tp->getVectorElementType()->getPrimitiveSizeInBits() && + LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { + + unsigned VecTySize = DL.getTypeStoreSize(Tp); + unsigned LegalVTSize = LegalVT.getStoreSize(); + // Number of source vectors after legalization: + unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; + // Number of destination vectors after legalization: + unsigned NumOfDests = LT.first; + + Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), + LegalVT.getVectorNumElements()); + + unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; + return NumOfShuffles * + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); + } - static const CostTblEntry AVX512BWShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128 - // + 2*pshufb + vinserti64x4 - }; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } - if (ST->hasBWI()) - if (const auto *Entry = - CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + // For 2-input shuffles, we must account for splitting the 2 inputs into many. + if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { + // We assume that source and destination have the same vector type. 
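Plugging numbers into the single-source split formula above makes the accounting easier to follow; the concrete v32i16-to-v16i16 legalization is an assumed example (an AVX2-like target), not something the patch specifies.

    // Worked instance of the SK_PermuteSingleSrc split cost above.
    constexpr unsigned VecTySize   = 64;   // store size of v32i16, in bytes
    constexpr unsigned LegalVTSize = 32;   // store size of v16i16, in bytes
    constexpr unsigned NumOfSrcs   =
        (VecTySize + LegalVTSize - 1) / LegalVTSize;                   // = 2
    constexpr unsigned NumOfDests  = 2;    // LT.first after legalization
    constexpr unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;   // = 2
    static_assert(NumOfShuffles == 2,
                  "each destination needs one two-source shuffle of the legal type");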
+ int NumOfDests = LT.first; + int NumOfShufflesPerDest = LT.first * 2 - 1; + LT.first = NumOfDests * NumOfShufflesPerDest; + } - static const CostTblEntry AVX512ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd - }; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb + { TTI::SK_Reverse, MVT::v32i8, 1 }, // vpermb - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + { TTI::SK_PermuteSingleSrc, MVT::v64i8, 1 }, // vpermb + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 1 }, // vpermb - static const CostTblEntry AVX2ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd - { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb - { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb + { TTI::SK_PermuteTwoSrc, MVT::v64i8, 1 }, // vpermt2b + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 1 }, // vpermt2b + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 } // vpermt2b + }; - { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw - { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb - }; + if (ST->hasVBMI()) + if (const auto *Entry = + CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX512BWShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v64i8, 6 }, // vextracti64x4 + 2*vperm2i128 + // + 2*pshufb + vinserti64x4 + + { TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // vpermw + { TTI::SK_PermuteSingleSrc, MVT::v64i8, 8 }, // extend to v32i16 + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 3 }, // vpermw + zext/trunc + + { TTI::SK_PermuteTwoSrc, MVT::v32i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v16i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpermt2w + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 3 }, // zext + vpermt2w + trunc + { TTI::SK_PermuteTwoSrc, MVT::v64i8, 19 }, // 6 * v32i8 + 1 + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 } // zext + vpermt2w + trunc + }; - static const CostTblEntry AVX1ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd - { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps - { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb - // + vinsertf128 - { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb - // + vinsertf128 - - { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd - { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd - { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps - { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps - { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor - { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand 
+ vpandn + vpor - }; + if (ST->hasBWI()) + if (const auto *Entry = + CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX512ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd + + { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd + + { TTI::SK_PermuteSingleSrc, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v16f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // vpermps + { TTI::SK_PermuteSingleSrc, MVT::v8i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v16i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_PermuteTwoSrc, MVT::v8f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v16f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v8i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v16i32, 1 }, // vpermt2d + { TTI::SK_PermuteTwoSrc, MVT::v4f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v8f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v4i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v8i32, 1 }, // vpermt2d + { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // vpermt2pd + { TTI::SK_PermuteTwoSrc, MVT::v4f32, 1 }, // vpermt2ps + { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // vpermt2q + { TTI::SK_PermuteTwoSrc, MVT::v4i32, 1 } // vpermt2d + }; - static const CostTblEntry SSE41ShuffleTbl[] = { - { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps - { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw - { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb - }; - - if (ST->hasSSE41()) - if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry SSSE3ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb - { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb + static const CostTblEntry AVX2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd + { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd + 
{ TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb + { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb + + { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw + { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb + }; - { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por - }; + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - if (ST->hasSSSE3()) - if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 + { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 + + { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb + // + vinsertf128 + + { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd + { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd + { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps + { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps + { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor + { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor + }; - static const CostTblEntry SSE2ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd - { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd - { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw - // + 2*pshufd + 2*unpck + packus - - { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd - { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps - { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por - }; - - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry SSE1ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps - { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps - }; + static const CostTblEntry SSE41ShuffleTbl[] = { + { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps + { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw + { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb + }; - if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) - return LT.first * Entry->Cost; + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; - } else if (Kind == 
TTI::SK_PermuteTwoSrc) { - // We assume that source and destination have the same vector type. - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - int NumOfDests = LT.first; - int NumOfShufflesPerDest = LT.first * 2 - 1; - int NumOfShuffles = NumOfDests * NumOfShufflesPerDest; - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // vpermt2b - }; - - if (ST->hasVBMI()) - if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - static const CostTblEntry AVX512BWShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermt2w - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}, // zext + vpermt2w + trunc - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1 - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // zext + vpermt2w + trunc - }; - - if (ST->hasBWI()) - if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - static const CostTblEntry AVX512ShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d - {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermt2d - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermt2pd - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermt2ps - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermt2q - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1} // vpermt2d - }; + static const CostTblEntry SSSE3ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) - return NumOfShuffles * Entry->Cost; - - } else if (Kind == TTI::SK_PermuteSingleSrc) { - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); - if (LT.first == 1) { - - static const CostTblEntry AVX512VBMIShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1} // vpermb - }; - - if (ST->hasVBMI()) - if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - static const CostTblEntry AVX512BWShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermw - {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8}, // extend to v32i16 - {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3} // vpermw + zext/trunc - }; - - if (ST->hasBWI()) - if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, - ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - static const CostTblEntry AVX512ShuffleTbl[] = { - {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermpd - {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermps - {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermq - 
{ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermq - {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermq - {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, // vpermd - {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // pshufb - }; - - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) - return Entry->Cost; - - } else { - // We are going to permute multiple sources and the result will be in - // multiple destinations. Providing an accurate cost only for splits where - // the element type remains the same. - - MVT LegalVT = LT.second; - if (LegalVT.getVectorElementType().getSizeInBits() == - Tp->getVectorElementType()->getPrimitiveSizeInBits() && - LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { - - unsigned VecTySize = DL.getTypeStoreSize(Tp); - unsigned LegalVTSize = LegalVT.getStoreSize(); - // Number of source vectors after legalization: - unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; - // Number of destination vectors after legalization: - unsigned NumOfDests = LT.first; - - Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), - LegalVT.getVectorNumElements()); - - unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; - return NumOfShuffles * - getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); - } - } - } + { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por + { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por + }; + + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd + { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd + + { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd + { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw + // + 2*pshufd + 2*unpck + packus + + { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd + { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps + { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por + { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por + }; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps + { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps + { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps + }; + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } @@ -1623,17 +1587,29 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost+LT.first; } -int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in 
vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting // extra micro-ops can significantly decrease throughput. unsigned NumVectorInstToHideOverhead = 10; - if (Ty->isVectorTy() && IsComplex) - return NumVectorInstToHideOverhead; + // Cost modeling of Strided Access Computation is hidden by the indexing + // modes of X86 regardless of the stride value. We dont believe that there + // is a difference between constant strided access in gerenal and constant + // strided value which is less than or equal to 64. + // Even in the case of (loop invariant) stride whose value is not known at + // compile time, the address computation will not incur more than one extra + // ADD instruction. + if (Ty->isVectorTy() && SE) { + if (!BaseT::isStridedAccess(Ptr)) + return NumVectorInstToHideOverhead; + if (!BaseT::getConstantStrideStep(SE, Ptr)) + return 1; + } - return BaseT::getAddressComputationCost(Ty, IsComplex); + return BaseT::getAddressComputationCost(Ty, SE, Ptr); } int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index f6bcb9f569e..c013805f432 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -71,7 +71,8 @@ class X86TTIImpl : public BasicTTIImplBase { unsigned AddressSpace); int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment); - int getAddressComputationCost(Type *PtrTy, bool IsComplex); + int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, + const SCEV *Ptr); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys, FastMathFlags FMF); diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 6dd95f8dcd5..6b32f6c31f7 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -36,7 +36,10 @@ using namespace llvm; -STATISTIC(NumImported, "Number of functions imported"); +STATISTIC(NumImportedFunctions, "Number of functions imported"); +STATISTIC(NumImportedModules, "Number of modules imported from"); +STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index"); +STATISTIC(NumLiveSymbols, "Number of live symbols in index"); /// Limit on instruction count of imported functions. static cl::opt ImportInstrLimit( @@ -69,6 +72,9 @@ static cl::opt ImportColdMultiplier( static cl::opt PrintImports("print-imports", cl::init(false), cl::Hidden, cl::desc("Print imported functions")); +static cl::opt ComputeDead("compute-dead", cl::init(true), cl::Hidden, + cl::desc("Compute dead symbols")); + // Temporary allows the function import pass to disable always linking // referenced discardable symbols. static cl::opt @@ -105,78 +111,6 @@ static std::unique_ptr loadFile(const std::string &FileName, namespace { -// Return true if the Summary describes a GlobalValue that can be externally -// referenced, i.e. it does not need renaming (linkage is not local) or renaming -// is possible (does not have a section for instance). -static bool canBeExternallyReferenced(const GlobalValueSummary &Summary) { - if (!Summary.needsRenaming()) - return true; - - if (Summary.noRename()) - // Can't externally reference a global that needs renaming if has a section - // or is referenced from inline assembly, for example. 
- return false; - - return true; -} - -// Return true if \p GUID describes a GlobalValue that can be externally -// referenced, i.e. it does not need renaming (linkage is not local) or -// renaming is possible (does not have a section for instance). -static bool canBeExternallyReferenced(const ModuleSummaryIndex &Index, - GlobalValue::GUID GUID) { - auto Summaries = Index.findGlobalValueSummaryList(GUID); - if (Summaries == Index.end()) - return true; - if (Summaries->second.size() != 1) - // If there are multiple globals with this GUID, then we know it is - // not a local symbol, and it is necessarily externally referenced. - return true; - - // We don't need to check for the module path, because if it can't be - // externally referenced and we call it, it is necessarilly in the same - // module - return canBeExternallyReferenced(**Summaries->second.begin()); -} - -// Return true if the global described by \p Summary can be imported in another -// module. -static bool eligibleForImport(const ModuleSummaryIndex &Index, - const GlobalValueSummary &Summary) { - if (!canBeExternallyReferenced(Summary)) - // Can't import a global that needs renaming if has a section for instance. - // FIXME: we may be able to import it by copying it without promotion. - return false; - - // Don't import functions that are not viable to inline. - if (Summary.isNotViableToInline()) - return false; - - // Check references (and potential calls) in the same module. If the current - // value references a global that can't be externally referenced it is not - // eligible for import. First check the flag set when we have possible - // opaque references (e.g. inline asm calls), then check the call and - // reference sets. - if (Summary.hasInlineAsmMaybeReferencingInternal()) - return false; - bool AllRefsCanBeExternallyReferenced = - llvm::all_of(Summary.refs(), [&](const ValueInfo &VI) { - return canBeExternallyReferenced(Index, VI.getGUID()); - }); - if (!AllRefsCanBeExternallyReferenced) - return false; - - if (auto *FuncSummary = dyn_cast(&Summary)) { - bool AllCallsCanBeExternallyReferenced = llvm::all_of( - FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) { - return canBeExternallyReferenced(Index, Edge.first.getGUID()); - }); - if (!AllCallsCanBeExternallyReferenced) - return false; - } - return true; -} - /// Given a list of possible callee implementation for a call site, select one /// that fits the \p Threshold. /// @@ -214,7 +148,7 @@ selectCallee(const ModuleSummaryIndex &Index, if (Summary->instCount() > Threshold) return false; - if (!eligibleForImport(Index, *Summary)) + if (Summary->notEligibleToImport()) return false; return true; @@ -346,7 +280,8 @@ static void computeImportForFunction( static void ComputeImportForModule( const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index, FunctionImporter::ImportMapTy &ImportList, - StringMap *ExportLists = nullptr) { + StringMap *ExportLists = nullptr, + const DenseSet *DeadSymbols = nullptr) { // Worklist contains the list of function imported in this module, for which // we will analyse the callees and may import further down the callgraph. 
SmallVector Worklist; @@ -354,6 +289,10 @@ static void ComputeImportForModule( // Populate the worklist with the import for the functions in the current // module for (auto &GVSummary : DefinedGVSummaries) { + if (DeadSymbols && DeadSymbols->count(GVSummary.first)) { + DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n"); + continue; + } auto *Summary = GVSummary.second; if (auto *AS = dyn_cast(Summary)) Summary = &AS->getAliasee(); @@ -393,14 +332,15 @@ void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, const StringMap &ModuleToDefinedGVSummaries, StringMap &ImportLists, - StringMap &ExportLists) { + StringMap &ExportLists, + const DenseSet *DeadSymbols) { // For each module that has function defined, compute the import/export lists. for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) { auto &ImportList = ImportLists[DefinedGVSummaries.first()]; DEBUG(dbgs() << "Computing import for Module '" << DefinedGVSummaries.first() << "'\n"); ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList, - &ExportLists); + &ExportLists, DeadSymbols); } // When computing imports we added all GUIDs referenced by anything @@ -462,6 +402,86 @@ void llvm::ComputeCrossModuleImportForModule( #endif } +DenseSet llvm::computeDeadSymbols( + const ModuleSummaryIndex &Index, + const DenseSet &GUIDPreservedSymbols) { + if (!ComputeDead) + return DenseSet(); + if (GUIDPreservedSymbols.empty()) + // Don't do anything when nothing is live, this is friendly with tests. + return DenseSet(); + DenseSet LiveSymbols = GUIDPreservedSymbols; + SmallVector Worklist; + Worklist.reserve(LiveSymbols.size() * 2); + for (auto GUID : LiveSymbols) { + DEBUG(dbgs() << "Live root: " << GUID << "\n"); + Worklist.push_back(GUID); + } + // Add values flagged in the index as live roots to the worklist. 
+ for (const auto &Entry : Index) { + bool IsLiveRoot = llvm::any_of( + Entry.second, + [&](const std::unique_ptr &Summary) { + return Summary->liveRoot(); + }); + if (!IsLiveRoot) + continue; + DEBUG(dbgs() << "Live root (summary): " << Entry.first << "\n"); + Worklist.push_back(Entry.first); + } + + while (!Worklist.empty()) { + auto GUID = Worklist.pop_back_val(); + auto It = Index.findGlobalValueSummaryList(GUID); + if (It == Index.end()) { + DEBUG(dbgs() << "Not in index: " << GUID << "\n"); + continue; + } + + // FIXME: we should only make the prevailing copy live here + for (auto &Summary : It->second) { + for (auto Ref : Summary->refs()) { + auto RefGUID = Ref.getGUID(); + if (LiveSymbols.insert(RefGUID).second) { + DEBUG(dbgs() << "Marking live (ref): " << RefGUID << "\n"); + Worklist.push_back(RefGUID); + } + } + if (auto *FS = dyn_cast(Summary.get())) { + for (auto Call : FS->calls()) { + auto CallGUID = Call.first.getGUID(); + if (LiveSymbols.insert(CallGUID).second) { + DEBUG(dbgs() << "Marking live (call): " << CallGUID << "\n"); + Worklist.push_back(CallGUID); + } + } + } + if (auto *AS = dyn_cast(Summary.get())) { + auto AliaseeGUID = AS->getAliasee().getOriginalName(); + if (LiveSymbols.insert(AliaseeGUID).second) { + DEBUG(dbgs() << "Marking live (alias): " << AliaseeGUID << "\n"); + Worklist.push_back(AliaseeGUID); + } + } + } + } + DenseSet DeadSymbols; + DeadSymbols.reserve( + std::min(Index.size(), Index.size() - LiveSymbols.size())); + for (auto &Entry : Index) { + auto GUID = Entry.first; + if (!LiveSymbols.count(GUID)) { + DEBUG(dbgs() << "Marking dead: " << GUID << "\n"); + DeadSymbols.insert(GUID); + } + } + DEBUG(dbgs() << LiveSymbols.size() << " symbols Live, and " + << DeadSymbols.size() << " symbols Dead \n"); + NumDeadSymbols += DeadSymbols.size(); + NumLiveSymbols += LiveSymbols.size(); + return DeadSymbols; +} + /// Compute the set of summaries needed for a ThinLTO backend compilation of /// \p ModulePath. void llvm::gatherImportedSummariesForModule( @@ -625,7 +645,6 @@ Expected FunctionImporter::importFunctions( // now, before linking it (otherwise this will be a noop). if (Error Err = SrcModule->materializeMetadata()) return std::move(Err); - UpgradeDebugInfo(*SrcModule); auto &ImportGUIDs = FunctionsToImportPerModule->second; // Find the globals to import @@ -698,6 +717,10 @@ Expected FunctionImporter::importFunctions( } } + // Upgrade debug info after we're done materializing all the globals and we + // have loaded all the required metadata! + UpgradeDebugInfo(*SrcModule); + // Link in the specified functions. 
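[Note on computeDeadSymbols above: it is a mark-and-sweep over the summary index. Preserved GUIDs and summaries flagged as live roots seed a worklist, references, calls and aliasees are marked transitively, and whatever remains is reported dead. A hedged sketch of how a ThinLTO driver might wire it into the import computation; the template arguments (DenseSet<GlobalValue::GUID>, StringMap<GVSummaryMapTy>, etc.) are restored from context since the generics are elided in the hunks above, and runImportPhase itself is an illustrative wrapper, not code from this patch:

    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Transforms/IPO/FunctionImport.h"
    using namespace llvm;

    void runImportPhase(
        const ModuleSummaryIndex &CombinedIndex,
        const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
        const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
      // Dead-strip the combined index first: preserved GUIDs and liveRoot()
      // summaries seed the worklist, everything unreachable comes back dead.
      DenseSet<GlobalValue::GUID> DeadSymbols =
          computeDeadSymbols(CombinedIndex, GUIDPreservedSymbols);

      // Then build the per-module import/export lists; ComputeImportForModule
      // skips any definition whose GUID is in DeadSymbols.
      StringMap<FunctionImporter::ImportMapTy> ImportLists(
          ModuleToDefinedGVSummaries.size());
      StringMap<FunctionImporter::ExportSetTy> ExportLists(
          ModuleToDefinedGVSummaries.size());
      ComputeCrossModuleImport(CombinedIndex, ModuleToDefinedGVSummaries,
                               ImportLists, ExportLists, &DeadSymbols);
    }
]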
if (renameModuleForThinLTO(*SrcModule, Index, &GlobalsToImport)) return true; @@ -717,9 +740,10 @@ Expected FunctionImporter::importFunctions( report_fatal_error("Function Import: link error"); ImportedCount += GlobalsToImport.size(); + NumImportedModules++; } - NumImported += ImportedCount; + NumImportedFunctions += ImportedCount; DEBUG(dbgs() << "Imported " << ImportedCount << " functions for Module " << DestModule.getModuleIdentifier() << "\n"); diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index 2948878cffc..f4742aaf748 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -27,9 +27,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndexYAML.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/TrailingObjects.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" @@ -52,6 +55,20 @@ static cl::opt AvoidReuse( cl::desc("Try to avoid reuse of byte array addresses using aliases"), cl::Hidden, cl::init(true)); +static cl::opt ClSummaryAction( + "lowertypetests-summary-action", + cl::desc("What to do with the summary when running this pass"), cl::Hidden); + +static cl::opt ClReadSummary( + "lowertypetests-read-summary", + cl::desc("Read summary from given YAML file before running pass"), + cl::Hidden); + +static cl::opt ClWriteSummary( + "lowertypetests-write-summary", + cl::desc("Write summary to given YAML file after running pass"), + cl::Hidden); + bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { if (Offset < ByteOffset) return false; @@ -66,38 +83,6 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const { return Bits.count(BitOffset); } -bool BitSetInfo::containsValue( - const DataLayout &DL, - const DenseMap &GlobalLayout, Value *V, - uint64_t COffset) const { - if (auto GV = dyn_cast(V)) { - auto I = GlobalLayout.find(GV); - if (I == GlobalLayout.end()) - return false; - return containsGlobalOffset(I->second + COffset); - } - - if (auto GEP = dyn_cast(V)) { - APInt APOffset(DL.getPointerSizeInBits(0), 0); - bool Result = GEP->accumulateConstantOffset(DL, APOffset); - if (!Result) - return false; - COffset += APOffset.getZExtValue(); - return containsValue(DL, GlobalLayout, GEP->getPointerOperand(), COffset); - } - - if (auto Op = dyn_cast(V)) { - if (Op->getOpcode() == Instruction::BitCast) - return containsValue(DL, GlobalLayout, Op->getOperand(0), COffset); - - if (Op->getOpcode() == Instruction::Select) - return containsValue(DL, GlobalLayout, Op->getOperand(1), COffset) && - containsValue(DL, GlobalLayout, Op->getOperand(2), COffset); - } - - return false; -} - void BitSetInfo::print(raw_ostream &OS) const { OS << "offset " << ByteOffset << " size " << BitSize << " align " << (1 << AlignLog2); @@ -204,7 +189,7 @@ struct ByteArrayInfo { std::set Bits; uint64_t BitSize; GlobalVariable *ByteArray; - Constant *Mask; + GlobalVariable *MaskGlobal; }; /// A POD-like structure that we use to store a global reference together with @@ -241,6 +226,9 @@ class GlobalTypeMember final : TrailingObjects { class LowerTypeTestsModule { Module &M; + // This is for testing purposes only. 
+ std::unique_ptr OwnedSummary; + bool LinkerSubsectionsViaSymbols; Triple::ArchType Arch; Triple::OSType OS; @@ -248,6 +236,7 @@ class LowerTypeTestsModule { IntegerType *Int1Ty = Type::getInt1Ty(M.getContext()); IntegerType *Int8Ty = Type::getInt8Ty(M.getContext()); + PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext()); IntegerType *Int32Ty = Type::getInt32Ty(M.getContext()); PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty); IntegerType *Int64Ty = Type::getInt64Ty(M.getContext()); @@ -259,6 +248,37 @@ class LowerTypeTestsModule { // Mapping from type identifiers to the call sites that test them. DenseMap> TypeTestCallSites; + /// This structure describes how to lower type tests for a particular type + /// identifier. It is either built directly from the global analysis (during + /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type + /// identifier summaries and external symbol references (in ThinLTO backends). + struct TypeIdLowering { + TypeTestResolution::Kind TheKind; + + /// All except Unsat: the start address within the combined global. + Constant *OffsetedGlobal; + + /// ByteArray, Inline, AllOnes: log2 of the required global alignment + /// relative to the start address. + Constant *AlignLog2; + + /// ByteArray, Inline, AllOnes: size of the memory region covering members + /// of this type identifier as a multiple of 2^AlignLog2. + Constant *Size; + + /// ByteArray, Inline, AllOnes: range of the size expressed as a bit width. + unsigned SizeBitWidth; + + /// ByteArray: the byte array to test the address against. + Constant *TheByteArray; + + /// ByteArray: the bit mask to apply to bytes loaded from the byte array. + Constant *BitMask; + + /// Inline: the bit mask to test the address against. + Constant *InlineBits; + }; + std::vector ByteArrayInfos; Function *WeakInitializerFn = nullptr; @@ -268,15 +288,13 @@ class LowerTypeTestsModule { const DenseMap &GlobalLayout); ByteArrayInfo *createByteArray(BitSetInfo &BSI); void allocateByteArrays(); - Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI, + Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL, Value *BitOffset); void lowerTypeTestCalls( ArrayRef TypeIds, Constant *CombinedGlobalAddr, const DenseMap &GlobalLayout); - Value * - lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - Constant *CombinedGlobal, - const DenseMap &GlobalLayout); + Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI, + const TypeIdLowering &TIL); void buildBitSetsFromGlobalVariables(ArrayRef TypeIds, ArrayRef Globals); unsigned getJumpTableEntrySize(); @@ -302,6 +320,7 @@ class LowerTypeTestsModule { public: LowerTypeTestsModule(Module &M); + ~LowerTypeTestsModule(); bool lower(); }; @@ -380,7 +399,7 @@ ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) { BAI->Bits = BSI.Bits; BAI->BitSize = BSI.BitSize; BAI->ByteArray = ByteArrayGlobal; - BAI->Mask = ConstantExpr::getPtrToInt(MaskGlobal, Int8Ty); + BAI->MaskGlobal = MaskGlobal; return BAI; } @@ -399,8 +418,9 @@ void LowerTypeTestsModule::allocateByteArrays() { uint8_t Mask; BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask); - BAI->Mask->replaceAllUsesWith(ConstantInt::get(Int8Ty, Mask)); - cast(BAI->Mask->getOperand(0))->eraseFromParent(); + BAI->MaskGlobal->replaceAllUsesWith( + ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy)); + BAI->MaskGlobal->eraseFromParent(); } Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes); 
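[Worked example for the TypeIdLowering struct introduced above (my own illustration; the field values follow the selection logic in lowerTypeTestCalls further down, and Int8Ty / IntPtrTy / CombinedGlobalAddr are assumed to be in scope as in the pass): a type identifier whose members sit at byte offsets 0, 8, 16 and 24 of the combined global has ByteOffset 0, AlignLog2 3 and BitSize 4 with every bit set, so it resolves to:

    TypeIdLowering TIL;
    TIL.TheKind = TypeTestResolution::AllOnes;     // 4 bits, all set, > 1 member
    TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
        Int8Ty, CombinedGlobalAddr,
        ConstantInt::get(IntPtrTy, 0));            // ByteOffset 0
    TIL.AlignLog2 = ConstantInt::get(Int8Ty, 3);   // members are 8 bytes apart
    TIL.SizeBitWidth = 8;                          // BitSize 4 <= 256
    TIL.Size = ConstantInt::get(Int8Ty, 4);        // 4 members
    // TheByteArray, BitMask and InlineBits stay unset: AllOnes only needs the
    // range-and-alignment check.
]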
@@ -435,101 +455,121 @@ void LowerTypeTestsModule::allocateByteArrays() { ByteArraySizeBytes = BAB.Bytes.size(); } -/// Build a test that bit BitOffset is set in BSI, where -/// BitSetGlobal is a global containing the bits in BSI. -Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, - ByteArrayInfo *&BAI, +/// Build a test that bit BitOffset is set in the type identifier that was +/// lowered to TIL, which must be either an Inline or a ByteArray. +Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B, + const TypeIdLowering &TIL, Value *BitOffset) { - if (BSI.BitSize <= 64) { + if (TIL.TheKind == TypeTestResolution::Inline) { // If the bit set is sufficiently small, we can avoid a load by bit testing // a constant. - IntegerType *BitsTy; - if (BSI.BitSize <= 32) - BitsTy = Int32Ty; - else - BitsTy = Int64Ty; - - uint64_t Bits = 0; - for (auto Bit : BSI.Bits) - Bits |= uint64_t(1) << Bit; - Constant *BitsConst = ConstantInt::get(BitsTy, Bits); - return createMaskedBitTest(B, BitsConst, BitOffset); + return createMaskedBitTest(B, TIL.InlineBits, BitOffset); } else { - if (!BAI) { - ++NumByteArraysCreated; - BAI = createByteArray(BSI); - } - - Constant *ByteArray = BAI->ByteArray; - Type *Ty = BAI->ByteArray->getValueType(); + Constant *ByteArray = TIL.TheByteArray; if (!LinkerSubsectionsViaSymbols && AvoidReuse) { // Each use of the byte array uses a different alias. This makes the // backend less likely to reuse previously computed byte array addresses, // improving the security of the CFI mechanism based on this pass. - ByteArray = GlobalAlias::create(BAI->ByteArray->getValueType(), 0, - GlobalValue::PrivateLinkage, "bits_use", - ByteArray, &M); + ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage, + "bits_use", ByteArray, &M); } - Value *ByteAddr = B.CreateGEP(Ty, ByteArray, BitOffset); + Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset); Value *Byte = B.CreateLoad(ByteAddr); - Value *ByteAndMask = B.CreateAnd(Byte, BAI->Mask); + Value *ByteAndMask = + B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty)); return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0)); } } +static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL, + Value *V, uint64_t COffset) { + if (auto GV = dyn_cast(V)) { + SmallVector Types; + GV->getMetadata(LLVMContext::MD_type, Types); + for (MDNode *Type : Types) { + if (Type->getOperand(1) != TypeId) + continue; + uint64_t Offset = + cast( + cast(Type->getOperand(0))->getValue()) + ->getZExtValue(); + if (COffset == Offset) + return true; + } + return false; + } + + if (auto GEP = dyn_cast(V)) { + APInt APOffset(DL.getPointerSizeInBits(0), 0); + bool Result = GEP->accumulateConstantOffset(DL, APOffset); + if (!Result) + return false; + COffset += APOffset.getZExtValue(); + return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset); + } + + if (auto Op = dyn_cast(V)) { + if (Op->getOpcode() == Instruction::BitCast) + return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset); + + if (Op->getOpcode() == Instruction::Select) + return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) && + isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset); + } + + return false; +} + /// Lower a llvm.type.test call to its implementation. Returns the value to /// replace the call with. 
-Value *LowerTypeTestsModule::lowerBitSetCall( - CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI, - Constant *CombinedGlobalIntAddr, - const DenseMap &GlobalLayout) { +Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, + const TypeIdLowering &TIL) { + if (TIL.TheKind == TypeTestResolution::Unsat) + return ConstantInt::getFalse(M.getContext()); + Value *Ptr = CI->getArgOperand(0); const DataLayout &DL = M.getDataLayout(); - - if (BSI.containsValue(DL, GlobalLayout, Ptr)) + if (isKnownTypeIdMember(TypeId, DL, Ptr, 0)) return ConstantInt::getTrue(M.getContext()); - Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd( - CombinedGlobalIntAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)); - BasicBlock *InitialBB = CI->getParent(); IRBuilder<> B(CI); Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy); - if (BSI.isSingleOffset()) + Constant *OffsetedGlobalAsInt = + ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy); + if (TIL.TheKind == TypeTestResolution::Single) return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt); Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt); - Value *BitOffset; - if (BSI.AlignLog2 == 0) { - BitOffset = PtrOffset; - } else { - // We need to check that the offset both falls within our range and is - // suitably aligned. We can check both properties at the same time by - // performing a right rotate by log2(alignment) followed by an integer - // comparison against the bitset size. The rotate will move the lower - // order bits that need to be zero into the higher order bits of the - // result, causing the comparison to fail if they are nonzero. The rotate - // also conveniently gives us a bit offset to use during the load from - // the bitset. - Value *OffsetSHR = - B.CreateLShr(PtrOffset, ConstantInt::get(IntPtrTy, BSI.AlignLog2)); - Value *OffsetSHL = B.CreateShl( - PtrOffset, - ConstantInt::get(IntPtrTy, DL.getPointerSizeInBits(0) - BSI.AlignLog2)); - BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); - } - - Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize); + // We need to check that the offset both falls within our range and is + // suitably aligned. We can check both properties at the same time by + // performing a right rotate by log2(alignment) followed by an integer + // comparison against the bitset size. The rotate will move the lower + // order bits that need to be zero into the higher order bits of the + // result, causing the comparison to fail if they are nonzero. The rotate + // also conveniently gives us a bit offset to use during the load from + // the bitset. + Value *OffsetSHR = + B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy)); + Value *OffsetSHL = B.CreateShl( + PtrOffset, ConstantExpr::getZExt( + ConstantExpr::getSub( + ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)), + TIL.AlignLog2), + IntPtrTy)); + Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); + + Constant *BitSizeConst = ConstantExpr::getZExt(TIL.Size, IntPtrTy); Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst); // If the bit set is all ones, testing against it is unnecessary. - if (BSI.isAllOnes()) + if (TIL.TheKind == TypeTestResolution::AllOnes) return OffsetInRange; TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false); @@ -537,7 +577,7 @@ Value *LowerTypeTestsModule::lowerBitSetCall( // Now that we know that the offset is in range and aligned, load the // appropriate bit from the bitset. 
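[The rotate trick used by lowerTypeTestCall above deserves a worked example. A scalar model of the combined range-and-alignment check, written under the assumption of a 64-bit pointer width (illustration only, not the pass's actual IR-building code):

    #include <cstdint>

    bool offsetInRangeAndAligned(uint64_t PtrOffset, unsigned AlignLog2,
                                 uint64_t BitSetSize) {
      if (AlignLog2 == 0)
        return PtrOffset < BitSetSize;
      // Rotate right by log2(alignment): a misaligned offset gets its nonzero
      // low bits moved into the high bits, making the value huge and failing
      // the bound check; an aligned offset becomes its bit index into the
      // bitset.
      uint64_t BitOffset =
          (PtrOffset >> AlignLog2) | (PtrOffset << (64 - AlignLog2));
      return BitOffset < BitSetSize;
    }

    // With AlignLog2 = 3 and BitSetSize = 4 (members at +0, +8, +16, +24):
    //   offset 16 -> ror(16, 3) = 2  -> accepted (in range, aligned)
    //   offset 32 -> ror(32, 3) = 4  -> rejected (out of range)
    //   offset 12 -> huge value      -> rejected (misaligned)

    // If that check passes, the final bit test is either done on an inline
    // constant (TypeTestResolution::Inline, BitSize <= 64) ...
    bool inlineBitTest(uint64_t InlineBits, uint64_t BitOffset) {
      return (InlineBits >> BitOffset) & 1;
    }
    // ... or by loading one byte of the byte array and masking it
    // (TypeTestResolution::ByteArray), as createBitSetTest does above.
    bool byteArrayBitTest(const uint8_t *ByteArray, uint8_t BitMask,
                          uint64_t BitOffset) {
      return (ByteArray[BitOffset] & BitMask) != 0;
    }
]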
- Value *Bit = createBitSetTest(ThenB, BSI, BAI, BitOffset); + Value *Bit = createBitSetTest(ThenB, TIL, BitOffset); // The value we want is 0 if we came directly from the initial block // (having failed the range or alignment checks), or the loaded bit if @@ -622,11 +662,7 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( void LowerTypeTestsModule::lowerTypeTestCalls( ArrayRef TypeIds, Constant *CombinedGlobalAddr, const DenseMap &GlobalLayout) { - Constant *CombinedGlobalIntAddr = - ConstantExpr::getPtrToInt(CombinedGlobalAddr, IntPtrTy); - DenseMap GlobalObjLayout; - for (auto &P : GlobalLayout) - GlobalObjLayout[P.first->getGlobal()] = P.second; + CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy); // For each type identifier in this disjoint set... for (Metadata *TypeId : TypeIds) { @@ -640,13 +676,43 @@ void LowerTypeTestsModule::lowerTypeTestCalls( BSI.print(dbgs()); }); - ByteArrayInfo *BAI = nullptr; + TypeIdLowering TIL; + TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr( + Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)), + TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2); + if (BSI.isAllOnes()) { + TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single + : TypeTestResolution::AllOnes; + TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; + TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, + BSI.BitSize); + } else if (BSI.BitSize <= 64) { + TIL.TheKind = TypeTestResolution::Inline; + TIL.SizeBitWidth = (BSI.BitSize <= 32) ? 5 : 6; + TIL.Size = ConstantInt::get(Int8Ty, BSI.BitSize); + uint64_t InlineBits = 0; + for (auto Bit : BSI.Bits) + InlineBits |= uint64_t(1) << Bit; + if (InlineBits == 0) + TIL.TheKind = TypeTestResolution::Unsat; + else + TIL.InlineBits = ConstantInt::get( + (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits); + } else { + TIL.TheKind = TypeTestResolution::ByteArray; + TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; + TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, + BSI.BitSize); + ++NumByteArraysCreated; + ByteArrayInfo *BAI = createByteArray(BSI); + TIL.TheByteArray = BAI->ByteArray; + TIL.BitMask = BAI->MaskGlobal; + } // Lower each call to llvm.type.test for this type identifier. for (CallInst *CI : TypeTestCallSites[TypeId]) { ++NumTypeTestCallsLowered; - Value *Lowered = - lowerBitSetCall(CI, BSI, BAI, CombinedGlobalIntAddr, GlobalObjLayout); + Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL); CI->replaceAllUsesWith(Lowered); CI->eraseFromParent(); } @@ -1080,6 +1146,22 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( /// Lower all type tests in this module. LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { + // Handle the command-line summary arguments. This code is for testing + // purposes only, so we handle errors directly. 
+ if (!ClSummaryAction.empty()) { + OwnedSummary = make_unique(); + if (!ClReadSummary.empty()) { + ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary + + ": "); + auto ReadSummaryFile = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary))); + + yaml::Input In(ReadSummaryFile->getBuffer()); + In >> *OwnedSummary; + ExitOnErr(errorCodeToError(In.error())); + } + } + Triple TargetTriple(M.getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); Arch = TargetTriple.getArch(); @@ -1087,6 +1169,20 @@ LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) { ObjectFormat = TargetTriple.getObjectFormat(); } +LowerTypeTestsModule::~LowerTypeTestsModule() { + if (ClSummaryAction.empty() || ClWriteSummary.empty()) + return; + + ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + + ": "); + std::error_code EC; + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + ExitOnErr(errorCodeToError(EC)); + + yaml::Output Out(OS); + Out << *OwnedSummary; +} + bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_test)); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index f863d192fc2..b29ed3c8745 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1637,6 +1637,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::cos: + case Intrinsic::amdgcn_cos: { + Value *SrcSrc; + Value *Src = II->getArgOperand(0); + if (match(Src, m_FNeg(m_Value(SrcSrc))) || + match(Src, m_Intrinsic(m_Value(SrcSrc)))) { + // cos(-x) -> cos(x) + // cos(fabs(x)) -> cos(x) + II->setArgOperand(0, SrcSrc); + return II; + } + + break; + } case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: // Turn PPC lvx -> load if the pointer is known aligned. diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 6a7cb0e45c6..1d552839877 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -514,7 +514,8 @@ struct AddressSanitizer : public FunctionPass { void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp); - void instrumentUnusualSizeOrAlignment(Instruction *I, Value *Addr, + void instrumentUnusualSizeOrAlignment(Instruction *I, + Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp); @@ -1056,20 +1057,18 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, return nullptr; *IsWrite = false; } - // Only instrument if the mask is constant for now. - if (isa(CI->getOperand(2 + OpOffset))) { - auto BasePtr = CI->getOperand(0 + OpOffset); - auto Ty = cast(BasePtr->getType())->getElementType(); - *TypeSize = DL.getTypeStoreSizeInBits(Ty); - if (auto AlignmentConstant = - dyn_cast(CI->getOperand(1 + OpOffset))) - *Alignment = (unsigned)AlignmentConstant->getZExtValue(); - else - *Alignment = 1; // No alignment guarantees. 
We probably got Undef - if (MaybeMask) - *MaybeMask = CI->getOperand(2 + OpOffset); - PtrOperand = BasePtr; - } + + auto BasePtr = CI->getOperand(0 + OpOffset); + auto Ty = cast(BasePtr->getType())->getElementType(); + *TypeSize = DL.getTypeStoreSizeInBits(Ty); + if (auto AlignmentConstant = + dyn_cast(CI->getOperand(1 + OpOffset))) + *Alignment = (unsigned)AlignmentConstant->getZExtValue(); + else + *Alignment = 1; // No alignment guarantees. We probably got Undef + if (MaybeMask) + *MaybeMask = CI->getOperand(2 + OpOffset); + PtrOperand = BasePtr; } } @@ -1130,24 +1129,25 @@ void AddressSanitizer::instrumentPointerComparisonOrSubtraction( } static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I, - Value *Addr, unsigned Alignment, - unsigned Granularity, uint32_t TypeSize, - bool IsWrite, Value *SizeArgument, - bool UseCalls, uint32_t Exp) { + Instruction *InsertBefore, Value *Addr, + unsigned Alignment, unsigned Granularity, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, + uint32_t Exp) { // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check // if the data is properly aligned. if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || TypeSize == 128) && (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8)) - return Pass->instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, - UseCalls, Exp); - Pass->instrumentUnusualSizeOrAlignment(I, Addr, TypeSize, IsWrite, nullptr, - UseCalls, Exp); + return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite, + nullptr, UseCalls, Exp); + Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize, + IsWrite, nullptr, UseCalls, Exp); } static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, const DataLayout &DL, Type *IntptrTy, - ConstantVector *Mask, Instruction *I, + Value *Mask, Instruction *I, Value *Addr, unsigned Alignment, unsigned Granularity, uint32_t TypeSize, bool IsWrite, Value *SizeArgument, @@ -1157,15 +1157,30 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, unsigned Num = VTy->getVectorNumElements(); auto Zero = ConstantInt::get(IntptrTy, 0); for (unsigned Idx = 0; Idx < Num; ++Idx) { - // dyn_cast as we might get UndefValue - auto Masked = dyn_cast(Mask->getOperand(Idx)); - if (Masked && Masked->isAllOnesValue()) { + Value *InstrumentedAddress = nullptr; + Instruction *InsertBefore = I; + if (auto *Vector = dyn_cast(Mask)) { + // dyn_cast as we might get UndefValue + if (auto *Masked = dyn_cast(Vector->getOperand(Idx))) { + if (Masked->isNullValue()) + // Mask is constant false, so no instrumentation needed. 
+ continue; + // If we have a true or undef value, fall through to doInstrumentAddress + // with InsertBefore == I + } + } else { IRBuilder<> IRB(I); - auto InstrumentedAddress = - IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)}); - doInstrumentAddress(Pass, I, InstrumentedAddress, Alignment, Granularity, - ElemTypeSize, IsWrite, SizeArgument, UseCalls, Exp); + Value *MaskElem = IRB.CreateExtractElement(Mask, Idx); + TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false); + InsertBefore = ThenTerm; } + + IRBuilder<> IRB(InsertBefore); + InstrumentedAddress = + IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)}); + doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment, + Granularity, ElemTypeSize, IsWrite, SizeArgument, + UseCalls, Exp); } } @@ -1220,12 +1235,11 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, unsigned Granularity = 1 << Mapping.Scale; if (MaybeMask) { - auto Mask = cast(MaybeMask); - instrumentMaskedLoadOrStore(this, DL, IntptrTy, Mask, I, Addr, Alignment, - Granularity, TypeSize, IsWrite, nullptr, - UseCalls, Exp); + instrumentMaskedLoadOrStore(this, DL, IntptrTy, MaybeMask, I, Addr, + Alignment, Granularity, TypeSize, IsWrite, + nullptr, UseCalls, Exp); } else { - doInstrumentAddress(this, I, Addr, Alignment, Granularity, TypeSize, + doInstrumentAddress(this, I, I, Addr, Alignment, Granularity, TypeSize, IsWrite, nullptr, UseCalls, Exp); } } @@ -1342,9 +1356,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able // to report the actual access size. void AddressSanitizer::instrumentUnusualSizeOrAlignment( - Instruction *I, Value *Addr, uint32_t TypeSize, bool IsWrite, - Value *SizeArgument, bool UseCalls, uint32_t Exp) { - IRBuilder<> IRB(I); + Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, + bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) { + IRBuilder<> IRB(InsertBefore); Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); if (UseCalls) { @@ -1358,8 +1372,8 @@ void AddressSanitizer::instrumentUnusualSizeOrAlignment( Value *LastByte = IRB.CreateIntToPtr( IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), Addr->getType()); - instrumentAddress(I, I, Addr, 8, IsWrite, Size, false, Exp); - instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false, Exp); + instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp); + instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp); } } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 9485bfd7c29..0137378b828 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1572,6 +1572,13 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Assign value numbers to the new instructions. for (Instruction *I : NewInsts) { + // Instructions that have been inserted in predecessor(s) to materialize + // the load address do not retain their original debug locations. Doing + // so could lead to confusing (but correct) source attributions. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? + I->setDebugLoc(DebugLoc()); + // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. 
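[Note on the instrumentMaskedLoadOrStore rewrite above: each vector lane now falls into one of three cases. A constant-false mask element is skipped outright, a constant-true or undef element is checked unconditionally before the access, and a non-constant element is extracted and guarded with SplitBlockAndInsertIfThen so the check only runs when the lane is enabled. A scalar model of the run-time behaviour this produces for a masked store (my own sketch; asanCheckWrite stands in for the shadow check emitted by instrumentAddress and is not a real ASan runtime symbol):

    #include <cstddef>

    void asanCheckWrite(void *Addr, std::size_t Size); // hypothetical check hook

    template <typename T, std::size_t N>
    void maskedStoreModel(T *Base, const bool (&Mask)[N], const T (&Val)[N]) {
      for (std::size_t Idx = 0; Idx < N; ++Idx) {
        if (!Mask[Idx])
          continue;                            // disabled lane: no access, no check
        asanCheckWrite(&Base[Idx], sizeof(T)); // per-lane guarded check
        Base[Idx] = Val[Idx];                  // the lane's actual store
      }
    }
]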
If a block hasn't been processed yet, we would be @@ -1601,8 +1608,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) NewLoad->setMetadata(LLVMContext::MD_range, RangeMD); - // Transfer DebugLoc. - NewLoad->setDebugLoc(LI->getDebugLoc()); + // We do not propagate the old load's debug location, because the new + // load now lives in a different BB, and we want to avoid a jumpy line + // table. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? // Add the newly created load. ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred, diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 1cc5c8f0da8..6ef9d056132 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -408,6 +408,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurAST->deleteValue(&I); I.eraseFromParent(); } + Changed = true; continue; } @@ -766,6 +767,14 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // Move the new node to the Preheader, before its terminator. I.moveBefore(Preheader->getTerminator()); + // Do not retain debug locations when we are moving instructions to different + // basic blocks, because we want to avoid jumpy line tables. Calls, however, + // need to retain their debug locs because they may be inlined. + // FIXME: How do we retain source locations without causing poor debugging + // behavior? + if (!isa(I)) + I.setDebugLoc(DebugLoc()); + if (isa(I)) ++NumMovedLoads; else if (isa(I)) @@ -911,14 +920,23 @@ bool llvm::promoteLoopAccessesToScalars( // // If at least one store is guaranteed to execute, both properties are // satisfied, and promotion is legal. + // // This, however, is not a necessary condition. Even if no store/load is - // guaranteed to execute, we can still establish these properties: - // (p1) by proving that hoisting the load into the preheader is - // safe (i.e. proving dereferenceability on all paths through the loop). We + // guaranteed to execute, we can still establish these properties. + // We can establish (p1) by proving that hoisting the load into the preheader + // is safe (i.e. proving dereferenceability on all paths through the loop). We // can use any access within the alias set to prove dereferenceability, // since they're all must alias. - // (p2) by proving the memory is thread-local, so the memory model + // + // There are two ways establish (p2): + // a) Prove the location is thread-local. In this case the memory model // requirement does not apply, and stores are safe to insert. + // b) Prove a store dominates every exit block. In this case, if an exit + // blocks is reached, the original dynamic path would have taken us through + // the store, so inserting a store into the exit block is safe. Note that this + // is different from the store being guaranteed to execute. For instance, + // if an exception is thrown on the first iteration of the loop, the original + // store is never executed, but the exit blocks are not executed either. bool DereferenceableInPH = false; bool SafeToInsertStore = false; @@ -1000,6 +1018,17 @@ bool llvm::promoteLoopAccessesToScalars( } } + // If a store dominates all exit blocks, it is safe to sink. 
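[A small source-level illustration of the GVN change above (assumed example, not taken from the patch): the second a[i] below is partially redundant, so load PRE inserts the address computation and load into the predecessor where c is false. Those new instructions execute only on the !c path; if they kept the original load's source location, single-stepping that path would appear to jump into the if-body that was never taken, which is the "jumpy line table" the comments refer to.

    int f(int *a, int i, bool c) {
      int s = 0;
      if (c)
        s = a[i];        // a[i] is already available on this path
      return s + a[i];   // partially redundant load: PRE adds a copy on the !c path
    }
]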
+ // As explained above, if an exit block was executed, a dominating + // store must have been been executed at least once, so we are not + // introducing stores on paths that did not have them. + // Note that this only looks at explicit exit blocks. If we ever + // start sinking stores into unwind edges (see above), this will break. + if (!SafeToInsertStore) + SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) { + return DT->dominates(Store->getParent(), Exit); + }); + // If the store is not guaranteed to execute, we may still get // deref info through it. if (!DereferenceableInPH) { diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index fd167db1178..2743574ecca 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -997,7 +997,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { /// Check if the given conditional branch is based on the comparison between /// a variable and zero, and if the variable is non-zero, the control yields to /// the loop entry. If the branch matches the behavior, the variable involved -/// in the comparion is returned. This function will be called to see if the +/// in the comparison is returned. This function will be called to see if the /// precondition and postcondition of the loop are in desirable form. static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { if (!BI || !BI->isConditional()) diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index 90309d7ebba..f6435449777 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -283,8 +283,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // sinked. for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) { Instruction *I = &*II++; - if (!L.hasLoopInvariantOperands(I) || - !canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr)) + if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr)) continue; if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI)) Changed = true; diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 440e36767ed..678d02e05d4 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -56,12 +56,9 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( if (!isPerformingImport() && !isModuleExporting()) return false; - // If we are exporting, we need to see whether this value is marked - // as NoRename in the summary. If we are importing, we may not have - // a summary in the distributed backend case (only summaries for values - // importes as defs, not references, are included in the index passed - // to the distributed backends). if (isPerformingImport()) { + assert((!GlobalsToImport->count(SGV) || !isNonRenamableLocal(*SGV)) && + "Attempting to promote non-renamable local"); // We don't know for sure yet if we are importing this value (as either // a reference or a def), since we are simply walking all values in the // module. 
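[An example of the new store-promotion condition in promoteLoopAccessesToScalars above (mine, not from the patch; assume mayThrow() is known not to access *p, so promotion is otherwise legal): the store below is not guaranteed to execute, because mayThrow() can throw before the first iteration ever reaches it, yet its block dominates the loop's single exit. A store re-inserted in the exit block therefore only runs on paths that already stored to *p; an unwinding first iteration leaves through the unwind edge, not the exit block, which is exactly the caveat about unwind edges in the comment.

    void mayThrow();                 // hypothetical: may throw, does not touch *p

    void accumulate(int *p, int n) {
      int i = 0;
      do {
        mayThrow();                  // may leave the loop via an unwind edge...
        *p = i;                      // ...before this store ever executes once
      } while (++i < n);             // the store's block dominates this exit
    }
]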
But by necessity if we end up importing it and it is local, @@ -77,13 +74,28 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( assert(Summaries->second.size() == 1 && "Local has more than one summary"); auto Linkage = Summaries->second.front()->linkage(); if (!GlobalValue::isLocalLinkage(Linkage)) { - assert(!Summaries->second.front()->noRename()); + assert(!isNonRenamableLocal(*SGV) && + "Attempting to promote non-renamable local"); return true; } return false; } +#ifndef NDEBUG +bool FunctionImportGlobalProcessing::isNonRenamableLocal( + const GlobalValue &GV) const { + if (!GV.hasLocalLinkage()) + return false; + // This needs to stay in sync with the logic in buildModuleSummaryIndex. + if (GV.hasSection()) + return true; + if (Used.count(const_cast(&GV))) + return true; + return false; +} +#endif + std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV, bool DoPromote) { // For locals that must be promoted to global scope, ensure that diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 8cde0c4cd60..31daba2248a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6785,22 +6785,19 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) { return Cost; } -/// \brief Check whether the address computation for a non-consecutive memory -/// access looks like an unlikely candidate for being merged into the indexing -/// mode. +/// \brief Gets Address Access SCEV after verifying that the access pattern +/// is loop invariant except the induction variable dependence. /// -/// We look for a GEP which has one index that is an induction variable and all -/// other indices are loop invariant. If the stride of this access is also -/// within a small bound we decide that this address computation can likely be -/// merged into the addressing mode. -/// In all other cases, we identify the address computation as complex. -static bool isLikelyComplexAddressComputation(Value *Ptr, - LoopVectorizationLegality *Legal, - ScalarEvolution *SE, - const Loop *TheLoop) { +/// This SCEV can be sent to the Target in order to estimate the address +/// calculation cost. +static const SCEV *getAddressAccessSCEV( + Value *Ptr, + LoopVectorizationLegality *Legal, + ScalarEvolution *SE, + const Loop *TheLoop) { auto *Gep = dyn_cast(Ptr); if (!Gep) - return true; + return nullptr; // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. @@ -6809,33 +6806,11 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, Value *Opd = Gep->getOperand(i); if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && !Legal->isInductionVariable(Opd)) - return true; + return nullptr; } - // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step - // can likely be merged into the address computation. - unsigned MaxMergeDistance = 64; - - const SCEVAddRecExpr *AddRec = dyn_cast(SE->getSCEV(Ptr)); - if (!AddRec) - return true; - - // Check the step is constant. - const SCEV *Step = AddRec->getStepRecurrence(*SE); - // Calculate the pointer stride and check if it is consecutive. - const auto *C = dyn_cast(Step); - if (!C) - return true; - - const APInt &APStepVal = C->getAPInt(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return true; - - int64_t StepVal = APStepVal.getSExtValue(); - - return StepVal > MaxMergeDistance; + // Now we know we have a GEP ptr, %inv, %ind, %inv. 
return the Ptr SCEV. + return SE->getSCEV(Ptr); } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { @@ -7063,12 +7038,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned Cost = 0; Type *PtrTy = ToVectorTy(Ptr->getType(), VF); - // True if the memory instruction's address computation is complex. - bool IsComplexComputation = - isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); + // Figure out whether the access is strided and get the stride value + // if it's known in compile time + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); + Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS); diff --git a/test/Analysis/CostModel/AArch64/bswap.ll b/test/Analysis/CostModel/AArch64/bswap.ll new file mode 100644 index 00000000000..a97127a631d --- /dev/null +++ b/test/Analysis/CostModel/AArch64/bswap.ll @@ -0,0 +1,70 @@ +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s + +; Verify the cost of bswap instructions. + +declare i16 @llvm.bswap.i16(i16) +declare i32 @llvm.bswap.i32(i32) +declare i64 @llvm.bswap.i64(i64) + +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) +declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) + +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) +declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) +declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) + +define i16 @bswap_i16(i16 %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_i16': +; CHECK: Found an estimated cost of 1 for instruction: %bswap + %bswap = tail call i16 @llvm.bswap.i16(i16 %a) + ret i16 %bswap +} + +define i32 @bswap_i32(i32 %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_i32': +; CHECK: Found an estimated cost of 1 for instruction: %bswap + %bswap = tail call i32 @llvm.bswap.i32(i32 %a) + ret i32 %bswap +} + +define i64 @bswap_i64(i64 %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_i64': +; CHECK: Found an estimated cost of 1 for instruction: %bswap + %bswap = tail call i64 @llvm.bswap.i64(i64 %a) + ret i64 %bswap +} + +define <2 x i32> @bswap_v2i32(<2 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v2i32': +; CHECK: Found an estimated cost of 8 for instruction: %bswap + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a) + ret <2 x i32> %bswap +} + +define <4 x i16> @bswap_v4i16(<4 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v4i16': +; CHECK: Found an estimated cost of 22 for instruction: %bswap + %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %a) + ret <4 x i16> %bswap +} + +define <2 x i64> @bswap_v2i64(<2 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v2i64': +; CHECK: Found an estimated cost of 8 for instruction: %bswap + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a) + ret <2 x i64> %bswap +} + +define <4 x i32> @bswap_v4i32(<4 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v4i32': +; CHECK: Found an estimated cost of 22 for instruction: %bswap + %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a) + ret <4 x i32> %bswap +} + +define <8 x i16> @bswap_v8i16(<8 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'bswap_v8i16': +; CHECK: Found an estimated cost of 50 for instruction: %bswap + %bswap = call <8 x i16> @llvm.bswap.v8i16(<8 x 
i16> %a) + ret <8 x i16> %bswap +} diff --git a/test/Analysis/CostModel/AArch64/falkor.ll b/test/Analysis/CostModel/AArch64/falkor.ll new file mode 100644 index 00000000000..e9563191f07 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/falkor.ll @@ -0,0 +1,26 @@ +; RUN: opt < %s -cost-model -analyze -mcpu=falkor | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: vectorInstrCost +define void @vectorInstrCost() { + + ; Vector extracts - extracting the first element should have a zero cost; + ; all other elements should have a cost of two. + ; + ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0 + ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1 + %t1 = extractelement <2 x i64> undef, i32 0 + %t2 = extractelement <2 x i64> undef, i32 1 + + ; Vector inserts - inserting the first element should have a zero cost; all + ; other elements should have a cost of two. + ; + ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0 + ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 + %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 + %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 + + ret void +} diff --git a/test/Analysis/CostModel/AArch64/gep.ll b/test/Analysis/CostModel/AArch64/gep.ll index f3d83c13302..08bfc3d2123 100644 --- a/test/Analysis/CostModel/AArch64/gep.ll +++ b/test/Analysis/CostModel/AArch64/gep.ll @@ -1,9 +1,9 @@ -; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck %s +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" -define i8 @test1(i8* %p, i32 %i) { +define i8 @test1(i8* %p) { ; CHECK-LABEL: test1 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 0 @@ -11,7 +11,7 @@ define i8 @test1(i8* %p, i32 %i) { ret i8 %v } -define i16 @test2(i16* %p, i32 %i) { +define i16 @test2(i16* %p) { ; CHECK-LABEL: test2 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 0 @@ -19,7 +19,7 @@ define i16 @test2(i16* %p, i32 %i) { ret i16 %v } -define i32 @test3(i32* %p, i32 %i) { +define i32 @test3(i32* %p) { ; CHECK-LABEL: test3 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 0 @@ -27,7 +27,7 @@ define i32 @test3(i32* %p, i32 %i) { ret i32 %v } -define i64 @test4(i64* %p, i32 %i) { +define i64 @test4(i64* %p) { ; CHECK-LABEL: test4 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 0 @@ -35,7 +35,7 @@ define i64 @test4(i64* %p, i32 %i) { ret i64 %v } -define i8 @test5(i8* %p, i32 %i) { +define i8 @test5(i8* %p) { ; CHECK-LABEL: test5 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 1024 @@ -43,7 +43,7 @@ define i8 @test5(i8* %p, i32 %i) { ret i8 %v } -define i16 @test6(i16* %p, i32 %i) { +define i16 @test6(i16* %p) { ; CHECK-LABEL: test6 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 1024 @@ -51,7 +51,7 @@ define i16 @test6(i16* %p, i32 %i) { ret i16 %v } -define i32 @test7(i32* %p, i32 %i) { +define i32 @test7(i32* %p) { ; CHECK-LABEL: test7 ; CHECK: 
cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 1024 @@ -59,7 +59,7 @@ define i32 @test7(i32* %p, i32 %i) { ret i32 %v } -define i64 @test8(i64* %p, i32 %i) { +define i64 @test8(i64* %p) { ; CHECK-LABEL: test8 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 1024 @@ -67,7 +67,7 @@ define i64 @test8(i64* %p, i32 %i) { ret i64 %v } -define i8 @test9(i8* %p, i32 %i) { +define i8 @test9(i8* %p) { ; CHECK-LABEL: test9 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 4096 @@ -75,7 +75,7 @@ define i8 @test9(i8* %p, i32 %i) { ret i8 %v } -define i16 @test10(i16* %p, i32 %i) { +define i16 @test10(i16* %p) { ; CHECK-LABEL: test10 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 4096 @@ -83,7 +83,7 @@ define i16 @test10(i16* %p, i32 %i) { ret i16 %v } -define i32 @test11(i32* %p, i32 %i) { +define i32 @test11(i32* %p) { ; CHECK-LABEL: test11 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 4096 @@ -91,7 +91,7 @@ define i32 @test11(i32* %p, i32 %i) { ret i32 %v } -define i64 @test12(i64* %p, i32 %i) { +define i64 @test12(i64* %p) { ; CHECK-LABEL: test12 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 4096 @@ -99,7 +99,7 @@ define i64 @test12(i64* %p, i32 %i) { ret i64 %v } -define i8 @test13(i8* %p, i32 %i) { +define i8 @test13(i8* %p) { ; CHECK-LABEL: test13 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -64 @@ -107,7 +107,7 @@ define i8 @test13(i8* %p, i32 %i) { ret i8 %v } -define i16 @test14(i16* %p, i32 %i) { +define i16 @test14(i16* %p) { ; CHECK-LABEL: test14 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -64 @@ -115,7 +115,7 @@ define i16 @test14(i16* %p, i32 %i) { ret i16 %v } -define i32 @test15(i32* %p, i32 %i) { +define i32 @test15(i32* %p) { ; CHECK-LABEL: test15 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -64 @@ -123,7 +123,7 @@ define i32 @test15(i32* %p, i32 %i) { ret i32 %v } -define i64 @test16(i64* %p, i32 %i) { +define i64 @test16(i64* %p) { ; CHECK-LABEL: test16 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -64 @@ -131,7 +131,7 @@ define i64 @test16(i64* %p, i32 %i) { ret i64 %v } -define i8 @test17(i8* %p, i32 %i) { +define i8 @test17(i8* %p) { ; CHECK-LABEL: test17 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -1024 @@ -139,7 +139,7 @@ define i8 @test17(i8* %p, i32 %i) { ret i8 %v } -define i16 @test18(i16* %p, i32 %i) { +define i16 @test18(i16* %p) { ; CHECK-LABEL: test18 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -1024 @@ -147,7 +147,7 @@ define i16 @test18(i16* %p, i32 %i) { ret i16 %v } -define i32 @test19(i32* %p, i32 %i) { +define i32 @test19(i32* %p) { ; CHECK-LABEL: test19 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -1024 @@ -155,7 
+155,7 @@ define i32 @test19(i32* %p, i32 %i) { ret i32 %v } -define i64 @test20(i64* %p, i32 %i) { +define i64 @test20(i64* %p) { ; CHECK-LABEL: test20 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -1024 @@ -195,7 +195,7 @@ define i64 @test24(i64* %p, i32 %i) { ret i64 %v } -define i8 @test25(i8* %p, i32 %i) { +define i8 @test25(i8* %p) { ; CHECK-LABEL: test25 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -128 @@ -203,7 +203,7 @@ define i8 @test25(i8* %p, i32 %i) { ret i8 %v } -define i16 @test26(i16* %p, i32 %i) { +define i16 @test26(i16* %p) { ; CHECK-LABEL: test26 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -128 @@ -211,7 +211,7 @@ define i16 @test26(i16* %p, i32 %i) { ret i16 %v } -define i32 @test27(i32* %p, i32 %i) { +define i32 @test27(i32* %p) { ; CHECK-LABEL: test27 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -128 @@ -219,7 +219,7 @@ define i32 @test27(i32* %p, i32 %i) { ret i32 %v } -define i64 @test28(i64* %p, i32 %i) { +define i64 @test28(i64* %p) { ; CHECK-LABEL: test28 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -128 @@ -227,7 +227,7 @@ define i64 @test28(i64* %p, i32 %i) { ret i64 %v } -define i8 @test29(i8* %p, i32 %i) { +define i8 @test29(i8* %p) { ; CHECK-LABEL: test29 ; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -256 @@ -235,7 +235,7 @@ define i8 @test29(i8* %p, i32 %i) { ret i8 %v } -define i16 @test30(i16* %p, i32 %i) { +define i16 @test30(i16* %p) { ; CHECK-LABEL: test30 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -256 @@ -243,7 +243,7 @@ define i16 @test30(i16* %p, i32 %i) { ret i16 %v } -define i32 @test31(i32* %p, i32 %i) { +define i32 @test31(i32* %p) { ; CHECK-LABEL: test31 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -256 @@ -251,7 +251,7 @@ define i32 @test31(i32* %p, i32 %i) { ret i32 %v } -define i64 @test32(i64* %p, i32 %i) { +define i64 @test32(i64* %p) { ; CHECK-LABEL: test32 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -256 @@ -259,7 +259,7 @@ define i64 @test32(i64* %p, i32 %i) { ret i64 %v } -define i8 @test33(i8* %p, i32 %i) { +define i8 @test33(i8* %p) { ; CHECK-LABEL: test33 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* %a = getelementptr inbounds i8, i8* %p, i32 -512 @@ -267,7 +267,7 @@ define i8 @test33(i8* %p, i32 %i) { ret i8 %v } -define i16 @test34(i16* %p, i32 %i) { +define i16 @test34(i16* %p) { ; CHECK-LABEL: test34 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* %a = getelementptr inbounds i16, i16* %p, i32 -512 @@ -275,7 +275,7 @@ define i16 @test34(i16* %p, i32 %i) { ret i16 %v } -define i32 @test35(i32* %p, i32 %i) { +define i32 @test35(i32* %p) { ; CHECK-LABEL: test35 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* %a = getelementptr inbounds i32, i32* %p, i32 -512 @@ -283,7 +283,7 @@ define i32 @test35(i32* %p, i32 %i) { ret i32 %v } -define i64 @test36(i64* %p, i32 %i) { 
+define i64 @test36(i64* %p) { ; CHECK-LABEL: test36 ; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %a = getelementptr inbounds i64, i64* %p, i32 -512 diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll index 7319efb413d..b7a615f55cd 100644 --- a/test/Analysis/CostModel/X86/arith.ll +++ b/test/Analysis/CostModel/X86/arith.ll @@ -436,7 +436,7 @@ define i32 @mul(i32 %arg) { %A = mul <2 x i64> undef, undef ; SSSE3: cost of 16 {{.*}} %B = mul ; SSE42: cost of 16 {{.*}} %B = mul - ; AVX: cost of 16 {{.*}} %B = mul + ; AVX: cost of 18 {{.*}} %B = mul ; AVX2: cost of 8 {{.*}} %B = mul ; AVX512F: cost of 8 {{.*}} %B = mul ; AVX512BW: cost of 8 {{.*}} %B = mul @@ -444,7 +444,7 @@ define i32 @mul(i32 %arg) { %B = mul <4 x i64> undef, undef ; SSSE3: cost of 32 {{.*}} %C = mul ; SSE42: cost of 32 {{.*}} %C = mul - ; AVX: cost of 32 {{.*}} %C = mul + ; AVX: cost of 36 {{.*}} %C = mul ; AVX2: cost of 16 {{.*}} %C = mul ; AVX512F: cost of 8 {{.*}} %C = mul ; AVX512BW: cost of 8 {{.*}} %C = mul diff --git a/test/Analysis/CostModel/X86/shuffle-broadcast.ll b/test/Analysis/CostModel/X86/shuffle-broadcast.ll index a829a47f89f..86cf7569a72 100644 --- a/test/Analysis/CostModel/X86/shuffle-broadcast.ll +++ b/test/Analysis/CostModel/X86/shuffle-broadcast.ll @@ -18,14 +18,150 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer ; SSE: cost of 1 {{.*}} %V256 = shufflevector - ; AVX: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer ; SSE: cost of 1 {{.*}} %V512 = shufflevector - ; AVX: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer ret void } + +; CHECK-LABEL: 'test_vXi64' +define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: 
cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi32' +define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) { + ; SSE2: cost of 2 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer + + ; SSE2: cost of 2 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector + ; SSE42: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 3 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer + + ; SSE2: cost of 2 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector + ; SSE42: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 3 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { + ; SSE2: cost of 3 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} 
%V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer + + ; SSE2: cost of 3 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector + ; SSE42: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer + + ; SSE2: cost of 3 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector + ; SSE42: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer + + ret void +} diff --git a/test/Analysis/CostModel/X86/vdiv-cost.ll b/test/Analysis/CostModel/X86/vdiv-cost.ll index c8e4557cbef..a45bb4b3d0d 100644 --- a/test/Analysis/CostModel/X86/vdiv-cost.ll +++ b/test/Analysis/CostModel/X86/vdiv-cost.ll @@ -1,13 +1,20 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ define <4 x i32> @test1(<4 x i32> %a) { %div = udiv <4 x i32> %a, ret <4 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test1': -; SSE2: Found an estimated cost of 15 for instruction: %div -; AVX2: Found an estimated cost of 15 for instruction: %div +; SSE: Found an estimated cost of 15 for instruction: %div +; AVX: Found an estimated cost of 15 for instruction: %div +; AVX512: Found an estimated cost of 15 for instruction: %div } define <8 x i32> @test2(<8 x i32> %a) { @@ -15,8 +22,10 @@ define <8 x i32> @test2(<8 x i32> %a) { ret <8 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test2': -; SSE2: Found an estimated cost of 30 for instruction: %div +; SSE: Found an estimated cost of 30 for instruction: %div +; AVX1: Found an 
estimated cost of 30 for instruction: %div ; AVX2: Found an estimated cost of 15 for instruction: %div +; AVX512: Found an estimated cost of 15 for instruction: %div } define <8 x i16> @test3(<8 x i16> %a) { @@ -24,8 +33,9 @@ define <8 x i16> @test3(<8 x i16> %a) { ret <8 x i16> %div ; CHECK: 'Cost Model Analysis' for function 'test3': -; SSE2: Found an estimated cost of 6 for instruction: %div -; AVX2: Found an estimated cost of 6 for instruction: %div +; SSE: Found an estimated cost of 6 for instruction: %div +; AVX: Found an estimated cost of 6 for instruction: %div +; AVX512: Found an estimated cost of 6 for instruction: %div } define <16 x i16> @test4(<16 x i16> %a) { @@ -33,8 +43,10 @@ define <16 x i16> @test4(<16 x i16> %a) { ret <16 x i16> %div ; CHECK: 'Cost Model Analysis' for function 'test4': -; SSE2: Found an estimated cost of 12 for instruction: %div +; SSE: Found an estimated cost of 12 for instruction: %div +; AVX1: Found an estimated cost of 12 for instruction: %div ; AVX2: Found an estimated cost of 6 for instruction: %div +; AVX512: Found an estimated cost of 6 for instruction: %div } define <8 x i16> @test5(<8 x i16> %a) { @@ -42,8 +54,9 @@ define <8 x i16> @test5(<8 x i16> %a) { ret <8 x i16> %div ; CHECK: 'Cost Model Analysis' for function 'test5': -; SSE2: Found an estimated cost of 6 for instruction: %div -; AVX2: Found an estimated cost of 6 for instruction: %div +; SSE: Found an estimated cost of 6 for instruction: %div +; AVX: Found an estimated cost of 6 for instruction: %div +; AVX512: Found an estimated cost of 6 for instruction: %div } define <16 x i16> @test6(<16 x i16> %a) { @@ -51,8 +64,10 @@ define <16 x i16> @test6(<16 x i16> %a) { ret <16 x i16> %div ; CHECK: 'Cost Model Analysis' for function 'test6': -; SSE2: Found an estimated cost of 12 for instruction: %div +; SSE: Found an estimated cost of 12 for instruction: %div +; AVX1: Found an estimated cost of 12 for instruction: %div ; AVX2: Found an estimated cost of 6 for instruction: %div +; AVX512: Found an estimated cost of 6 for instruction: %div } define <16 x i8> @test7(<16 x i8> %a) { @@ -60,8 +75,9 @@ define <16 x i8> @test7(<16 x i8> %a) { ret <16 x i8> %div ; CHECK: 'Cost Model Analysis' for function 'test7': -; SSE2: Found an estimated cost of 320 for instruction: %div -; AVX2: Found an estimated cost of 320 for instruction: %div +; SSE: Found an estimated cost of 320 for instruction: %div +; AVX: Found an estimated cost of 320 for instruction: %div +; AVX512: Found an estimated cost of 320 for instruction: %div } define <4 x i32> @test8(<4 x i32> %a) { @@ -69,8 +85,9 @@ define <4 x i32> @test8(<4 x i32> %a) { ret <4 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test8': -; SSE2: Found an estimated cost of 19 for instruction: %div -; AVX2: Found an estimated cost of 15 for instruction: %div +; SSE: Found an estimated cost of 19 for instruction: %div +; AVX: Found an estimated cost of 15 for instruction: %div +; AVX512: Found an estimated cost of 15 for instruction: %div } define <8 x i32> @test9(<8 x i32> %a) { @@ -78,8 +95,10 @@ define <8 x i32> @test9(<8 x i32> %a) { ret <8 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test9': -; SSE2: Found an estimated cost of 38 for instruction: %div +; SSE: Found an estimated cost of 38 for instruction: %div +; AVX1: Found an estimated cost of 38 for instruction: %div ; AVX2: Found an estimated cost of 15 for instruction: %div +; AVX512: Found an estimated cost of 15 for instruction: %div } define <8 x i32> @test10(<8 x i32> %a) { @@ 
-87,6 +106,17 @@ define <8 x i32> @test10(<8 x i32> %a) { ret <8 x i32> %div ; CHECK: 'Cost Model Analysis' for function 'test10': -; SSE2: Found an estimated cost of 160 for instruction: %div -; AVX2: Found an estimated cost of 160 for instruction: %div +; SSE: Found an estimated cost of 160 for instruction: %div +; AVX: Found an estimated cost of 160 for instruction: %div +; AVX512: Found an estimated cost of 160 for instruction: %div +} + +define <16 x i32> @test11(<16 x i32> %a) { + %div = sdiv <16 x i32> %a, + ret <16 x i32> %div + +; CHECK: 'Cost Model Analysis' for function 'test11': +; SSE: Found an estimated cost of 320 for instruction: %div +; AVX: Found an estimated cost of 320 for instruction: %div +; AVX512: Found an estimated cost of 320 for instruction: %div } diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll index e53e40b57e1..888164df75f 100644 --- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll @@ -1,9 +1,12 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK -check-prefix=XOP --check-prefix=XOPAVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK -check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW ; Verify the cost of vector arithmetic shift right instructions. 
@@ -17,6 +20,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <2 x i64> %a, %b ret <2 x i64> %shift @@ -28,17 +32,31 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift } +define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': +; SSE2: Found an estimated cost of 48 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <8 x i64> %a, %b + ret <8 x i64> %shift +} + define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <4 x i32> %a, %b @@ -51,18 +69,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, %b ret <8 x i32> %shift } +define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = ashr <16 x i32> %a, %b + ret <16 x i32> %shift +} + define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 
14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <8 x i16> %a, %b ret <8 x i16> %shift @@ -74,17 +107,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } +define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <32 x i16> %a, %b + ret <32 x i16> %shift +} + define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -96,11 +144,26 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 48 for instruction: %shift ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512F: Found an estimated cost of 24 for instruction: %shift +; AVX512BW: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } +define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': +; SSE2: Found an estimated cost of 216 for instruction: %shift +; SSE41: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX2: Found an estimated cost of 48 for instruction: %shift +; AVX512F: Found an estimated cost of 48 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <64 x i8> %a, %b + ret <64 x i8> %shift +} + ; ; Uniform Variable Shifts ; @@ -111,6 +174,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer %shift = ashr <2 x i64> %a, %splat @@ 
-123,18 +187,33 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat ret <4 x i64> %shift } +define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': +; SSE2: Found an estimated cost of 48 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = ashr <8 x i64> %a, %splat + ret <8 x i64> %shift +} + define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer @@ -148,6 +227,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -155,12 +235,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %shift } +define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = ashr <16 x i32> %a, %splat + ret <16 x i32> %shift +} + define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for 
instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i16> %a, %splat @@ -173,18 +268,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i16> %a, %splat ret <16 x i16> %shift } +define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = ashr <32 x i16> %a, %splat + ret <32 x i16> %shift +} + define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i8> %a, %splat @@ -197,12 +308,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 48 for instruction: %shift ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat ret <32 x i8> %shift } +define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': +; SSE2: Found an estimated cost of 216 for instruction: %shift +; SSE41: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX2: Found an estimated cost of 48 for instruction: %shift +; AVX512F: Found an estimated cost of 48 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = ashr <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; @@ -213,6 +339,7 @@ 
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <2 x i64> %a, ret <2 x i64> %shift @@ -224,17 +351,31 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': +; SSE2: Found an estimated cost of 48 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <4 x i32> %a, @@ -247,18 +388,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = ashr <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for 
instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -270,17 +426,32 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -292,11 +463,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 48 for instruction: %shift ; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } +define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': +; SSE2: Found an estimated cost of 216 for instruction: %shift +; SSE41: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX2: Found an estimated cost of 48 for instruction: %shift +; AVX512F: Found an estimated cost of 48 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <64 x i8> %a, + ret <64 x i8> %shift +} + ; ; Uniform Constant Shifts ; @@ -307,6 +492,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <2 x i64> %a, ret <2 x i64> %shift @@ -318,17 +504,31 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; 
AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <4 x i32> %a, @@ -341,18 +541,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = ashr <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -364,17 +579,32 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { +; CHECK: 
'Cost Model Analysis' for function 'splatconstant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift +; AVX512: Found an estimated cost of 4 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -386,7 +616,21 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift +; AVX512: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } + +define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 48 for instruction: %shift +; AVX512F: Found an estimated cost of 48 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = ashr <64 x i8> %a, + ret <64 x i8> %shift +} diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index 6d028268ea5..b3382253739 100644 --- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -1,9 +1,12 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s 
-mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW ; Verify the cost of vector logical shift right instructions. @@ -17,6 +20,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, %b @@ -29,18 +33,33 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, %b ret <4 x i64> %shift } +define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <8 x i64> %a, %b + ret <8 x i64> %shift +} + define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> 
%a, %b @@ -53,18 +72,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, %b ret <8 x i32> %shift } +define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, %b + ret <16 x i32> %shift +} + define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -76,17 +110,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } +define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, %b + ret <32 x i16> %shift +} + define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -98,11 +147,25 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for 
instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } +define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, %b + ret <64 x i8> %shift +} + ; ; Uniform Variable Shifts ; @@ -113,6 +176,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -126,6 +190,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -133,12 +198,27 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ret <4 x i64> %shift } +define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = lshr <8 x i64> %a, %splat + ret <8 x i64> %shift +} + define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer @@ -152,6 +232,7 @@ define <8 x i32> 
@splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -159,12 +240,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %shift } +define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = lshr <16 x i32> %a, %splat + ret <16 x i32> %shift +} + define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = lshr <8 x i16> %a, %splat @@ -177,18 +273,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i16> %a, %splat ret <16 x i16> %shift } +define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = lshr <32 x i16> %a, %splat + ret <32 x i16> %shift +} + define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found 
an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i8> %a, %splat @@ -201,12 +313,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat ret <32 x i8> %shift } +define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = lshr <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; @@ -217,6 +344,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, @@ -229,18 +357,33 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: 
%shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> %a, @@ -253,18 +396,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': +; SSE2: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -276,17 +434,32 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 12 for instruction: %shift ; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 12 for instruction: %shift +; AVX512: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = 
lshr <16 x i8> %a, ret <16 x i8> %shift @@ -298,11 +471,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 24 for instruction: %shift ; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } +define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, + ret <64 x i8> %shift +} + ; ; Uniform Constant Shifts ; @@ -313,6 +500,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <2 x i64> %a, @@ -325,18 +513,33 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i32> %a, @@ -349,18 +552,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: 
%shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = lshr <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -372,17 +590,32 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -394,7 +627,21 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> 
%a, ret <32 x i8> %shift } + +define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift + %shift = lshr <64 x i8> %a, + ret <64 x i8> %shift +} diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll index 60ba3adea42..804c5a76c31 100644 --- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -1,9 +1,12 @@ -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+xop,+avx2 -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=XOP --check-prefix=XOPAVX2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512dq -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw -cost-model -analyze | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW ; Verify the cost of vector shift left instructions. 
@@ -18,6 +21,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, %b @@ -30,18 +34,33 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift } +define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, %b + ret <8 x i64> %shift +} + define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 10 for instruction: %shift ; SSE41: Found an estimated cost of 10 for instruction: %shift ; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i32> %a, %b @@ -54,18 +73,33 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 20 for instruction: %shift ; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, %b ret <8 x i32> %shift } +define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': +; SSE2: Found an estimated cost of 40 for instruction: %shift +; SSE41: Found an estimated cost of 40 for instruction: %shift +; AVX: Found an estimated cost of 40 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <16 x i32> %a, %b + ret <16 x i32> %shift +} + define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for 
instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -77,17 +111,32 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } +define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <32 x i16> %a, %b + ret <32 x i16> %shift +} + define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -99,11 +148,25 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } +define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, %b + ret <64 x i8> %shift +} + ; ; Uniform Variable Shifts ; @@ -114,6 +177,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: 
Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -127,6 +191,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -134,12 +199,27 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ret <4 x i64> %shift } +define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer + %shift = shl <8 x i64> %a, %splat + ret <8 x i64> %shift +} + define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 10 for instruction: %shift ; SSE41: Found an estimated cost of 10 for instruction: %shift ; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer @@ -153,6 +233,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE41: Found an estimated cost of 20 for instruction: %shift ; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -160,12 +241,27 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %shift } +define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': +; SSE2: Found an estimated cost of 40 for instruction: %shift +; SSE41: Found an estimated cost of 40 for instruction: %shift +; AVX: Found an estimated cost of 40 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer + %shift = shl <16 x i32> %a, %splat + ret <16 x 
i32> %shift +} + define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 14 for instruction: %shift ; AVX: Found an estimated cost of 14 for instruction: %shift ; AVX2: Found an estimated cost of 14 for instruction: %shift +; AVX512: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = shl <8 x i16> %a, %splat @@ -178,18 +274,34 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE41: Found an estimated cost of 28 for instruction: %shift ; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift +; AVX512: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = shl <16 x i16> %a, %splat ret <16 x i16> %shift } +define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': +; SSE2: Found an estimated cost of 128 for instruction: %shift +; SSE41: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX2: Found an estimated cost of 20 for instruction: %shift +; AVX512F: Found an estimated cost of 20 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer + %shift = shl <32 x i16> %a, %splat + ret <32 x i16> %shift +} + define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = shl <16 x i8> %a, %splat @@ -202,12 +314,27 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat ret <32 x i8> %shift } +define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { +; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift 
+; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer + %shift = shl <64 x i8> %a, %splat + ret <64 x i8> %shift +} + ; ; Constant Shifts ; @@ -218,6 +345,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, @@ -230,18 +358,33 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 8 for instruction: %shift ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': +; SSE2: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 6 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i32> %a, @@ -254,18 +397,33 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': +; SSE2: Found an estimated cost of 24 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: 
%shift + %shift = shl <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, ret <8 x i16> %shift @@ -277,18 +435,34 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512F: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift ; SSE41: Found an estimated cost of 11 for instruction: %shift ; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -300,11 +474,25 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 22 for instruction: %shift ; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, ret <32 x i8> %shift } +define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': +; SSE2: Found an estimated cost of 104 for instruction: %shift +; SSE41: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, + ret <64 x i8> %shift +} + ; ; Uniform Constant Shifts ; @@ -315,6 +503,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) { ; SSE41: Found an estimated 
cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <2 x i64> %a, @@ -327,18 +516,33 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, ret <4 x i64> %shift } +define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <8 x i64> %a, + ret <8 x i64> %shift +} + define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i32': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 1 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i32> %a, @@ -351,18 +555,33 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, ret <8 x i32> %shift } +define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <16 x i32> %a, + ret <16 x i32> %shift +} + define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i16': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for 
instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, ret <8 x i16> %shift @@ -374,18 +593,34 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, ret <16 x i16> %shift } +define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 2 for instruction: %shift +; AVX512F: Found an estimated cost of 2 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX2: Found an estimated cost of 2 for instruction: %shift + %shift = shl <32 x i16> %a, + ret <32 x i16> %shift +} + define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8': ; SSE2: Found an estimated cost of 1 for instruction: %shift ; SSE41: Found an estimated cost of 1 for instruction: %shift ; AVX: Found an estimated cost of 1 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift +; AVX512: Found an estimated cost of 1 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -397,11 +632,25 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shift ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift +; AVX512: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, ret <32 x i8> %shift } +define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { +; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': +; SSE2: Found an estimated cost of 4 for instruction: %shift +; SSE41: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX2: Found an estimated cost of 22 for instruction: %shift +; AVX512F: Found an estimated cost of 22 for instruction: %shift +; AVX512BW: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift + %shift = shl <64 x i8> %a, + ret <64 x i8> %shift +} + ; ; Special Cases ; diff --git a/test/Bitcode/summary_version.ll b/test/Bitcode/summary_version.ll index dfb9e9b15e7..81025a221bb 100644 --- a/test/Bitcode/summary_version.ll +++ b/test/Bitcode/summary_version.ll @@ -2,7 +2,7 @@ ; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s ; CHECK: +; CHECK: diff --git a/test/Bitcode/thinlto-function-summary.ll b/test/Bitcode/thinlto-function-summary.ll index e42c55c1c2e..594aaab566d 100644 --- 
a/test/Bitcode/thinlto-function-summary.ll +++ b/test/Bitcode/thinlto-function-summary.ll @@ -10,7 +10,7 @@ ; BC-NEXT: . ; AdrpAddStr cannot be used when the store uses same ; register as address and value. Indeed, the related diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll index b697b6eced3..c7ba989d933 100644 --- a/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-enable-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s -; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-enable-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF +; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64-linux-gnu -O2 | FileCheck %s --check-prefix=CHECK-ELF ; CHECK-ELF-NOT: .loh ; CHECK-ELF-NOT: AdrpAdrp @@ -633,11 +633,14 @@ define void @setL(<1 x i8> %t) { ; a tuple register to appear in the lowering. Thus, the target ; cpu is required to have the problem reproduced. ; CHECK-LABEL: _uninterestingSub +; CHECK: [[LOH_LABEL0:Lloh[0-9]+]]: ; CHECK: adrp [[ADRP_REG:x[0-9]+]], [[CONSTPOOL:lCPI[0-9]+_[0-9]+]]@PAGE -; CHECK-NEXT: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] +; CHECK: [[LOH_LABEL1:Lloh[0-9]+]]: +; CHECK: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] ; The tuple comes from the next instruction. ; CHECK-NEXT: tbl.16b v{{[0-9]+}}, { v{{[0-9]+}}, v{{[0-9]+}} }, v[[IDX]] ; CHECK: ret +; CHECK: .loh AdrpLdr [[LOH_LABEL0]], [[LOH_LABEL1]] define void @uninterestingSub(i8* nocapture %row) #0 { %tmp = bitcast i8* %row to <16 x i8>* %tmp1 = load <16 x i8>, <16 x i8>* %tmp, align 16 @@ -664,10 +667,10 @@ entry: if.then.i: ret void if.end.i: -; CHECK: .loh AdrpAdrp Lloh91, Lloh93 -; CHECK: .loh AdrpLdr Lloh91, Lloh92 -; CHECK: .loh AdrpLdrGot Lloh93, Lloh95 -; CHECK: .loh AdrpLdrGot Lloh94, Lloh96 +; CHECK: .loh AdrpLdrGot +; CHECK: .loh AdrpLdrGot +; CHECK: .loh AdrpAdrp +; CHECK: .loh AdrpLdr %mul.i.i.i = fmul double undef, 1.000000e-06 %add.i.i.i = fadd double undef, %mul.i.i.i %sub.i.i = fsub double %add.i.i.i, undef diff --git a/test/CodeGen/AArch64/loh.mir b/test/CodeGen/AArch64/loh.mir new file mode 100644 index 00000000000..1d08ebdc579 --- /dev/null +++ b/test/CodeGen/AArch64/loh.mir @@ -0,0 +1,193 @@ +# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @func0() { ret void } + + declare void @extfunc() + + @g0 = external global i32 + @g1 = external global i32 + @g2 = external global i32 + @g3 = external global i32 + @g4 = external global i32 + @g5 = external global i32 +... +--- +# Check various LOH variants. Remember that the algorithms walks the basic +# blocks backwards. 
+# CHECK-LABEL: ********** AArch64 Collect LOH ********** +# CHECK-LABEL: Looking in function func0 +name: func0 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: Adding MCLOH_AdrpAdrp: + ; CHECK-NEXT: %X0 = ADRP + ; CHECK-NEXT: %X0 = ADRP + %x0 = ADRP target-flags(aarch64-page) @g0 + %x0 = ADRP target-flags(aarch64-page) @g1 + %x1 = ADRP target-flags(aarch64-page) @g2 + %x1 = ADRP target-flags(aarch64-page) @g3 + %x1 = ADRP target-flags(aarch64-page) @g4 + + bb.1: + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X20 = ADRP + ; CHECK-NEXT: %X3 = ADDXri %X20, + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = ADDXri %X1, + %x1 = ADRP target-flags(aarch64-page) @g0 + %x9 = SUBXri undef %x11, 5, 0 ; should not affect MCLOH formation + %x1 = ADDXri %x1, target-flags(aarch64-pageoff) @g0, 0 + %x20 = ADRP target-flags(aarch64-page) @g0 + BL @extfunc, csr_aarch64_aapcs ; should not clobber X20 + %x3 = ADDXri %x20, target-flags(aarch64-pageoff) @g0, 0 + + bb.2: + ; CHECK-NOT: MCLOH_AdrpAdd + %x9 = ADRP target-flags(aarch64-page) @g0 + BL @extfunc, csr_aarch64_aapcs ; clobbers x9 + ; Verification requires the use of 'undef' in front of the clobbered %x9 + %x9 = ADDXri undef %x9, target-flags(aarch64-pageoff) @g0, 0 + + bb.3: + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + HINT 0, implicit def %x10 ; clobbers x10 + %x10 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + + bb.4: + ; Cannot produce a LOH for multiple users + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + HINT 0, implicit def %x10 ; clobbers x10 + %x11 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + %x12 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + + bb.5: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X5 = ADRP + ; CHECK-NEXT: %S6 = LDRSui %X5, + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X4 = ADRP + ; CHECK-NEXT: %X4 = LDRXui %X4, + %x4 = ADRP target-flags(aarch64-page) @g2 + %x4 = LDRXui %x4, target-flags(aarch64-pageoff) @g2 + %x5 = ADRP target-flags(aarch64-page) @g2 + %s6 = LDRSui %x5, target-flags(aarch64-pageoff) @g2 + + bb.6: + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGot: + ; CHECK-NEXT: %X5 = ADRP + ; CHECK-NEXT: %X6 = LDRXui %X5, + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGot: + ; CHECK-NEXT: %X4 = ADRP + ; CHECK-NEXT: %X4 = LDRXui %X4, + %x4 = ADRP target-flags(aarch64-page, aarch64-got) @g2 + %x4 = LDRXui %x4, target-flags(aarch64-pageoff, aarch64-got) @g2 + %x5 = ADRP target-flags(aarch64-page, aarch64-got) @g2 + %x6 = LDRXui %x5, target-flags(aarch64-pageoff, aarch64-got) @g2 + + bb.7: + ; CHECK-NOT: Adding MCLOH_AdrpLdrGot: + ; Loading a float value from a GOT table makes no sense so this should not + ; produce an LOH. 
+ %x11 = ADRP target-flags(aarch64-page, aarch64-got) @g5 + %s11 = LDRSui %x11, target-flags(aarch64-pageoff, aarch64-got) @g5 + + bb.8: + ; CHECK-NEXT: Adding MCLOH_AdrpAddLdr: + ; CHECK-NEXT: %X7 = ADRP [TF=1] + ; CHECK-NEXT: %X8 = ADDXri %X7, + ; CHECK-NEXT: %D1 = LDRDui %X8, 8 + %x7 = ADRP target-flags(aarch64-page) @g3 + %x8 = ADDXri %x7, target-flags(aarch64-pageoff) @g3, 0 + %d1 = LDRDui %x8, 8 + + bb.9: + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X3 = ADRP + ; CHECK-NEXT: %X3 = ADDXri %X3, + ; CHECK-NEXT: Adding MCLOH_AdrpAdd: + ; CHECK-NEXT: %X5 = ADRP + ; CHECK-NEXT: %X2 = ADDXri %X5, + ; CHECK-NEXT: Adding MCLOH_AdrpAddStr: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = ADDXri %X1, + ; CHECK-NEXT: STRXui %XZR, %X1, 16 + %x1 = ADRP target-flags(aarch64-page) @g3 + %x1 = ADDXri %x1, target-flags(aarch64-pageoff) @g3, 0 + STRXui %xzr, %x1, 16 + + ; This sequence should just produce an AdrpAdd (not AdrpAddStr) + %x5 = ADRP target-flags(aarch64-page) @g3 + %x2 = ADDXri %x5, target-flags(aarch64-pageoff) @g3, 0 + STRXui %x2, undef %x11, 16 + + ; This sequence should just produce an AdrpAdd (not AdrpAddStr) + %x3 = ADRP target-flags(aarch64-page) @g3 + %x3 = ADDXri %x3, target-flags(aarch64-pageoff) @g3, 0 + STRXui %x3, %x3, 16 + + bb.10: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr: + ; CHECK-NEXT: %X2 = ADRP + ; CHECK-NEXT: %X2 = LDRXui %X2, + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGotLdr: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = LDRXui %X1, + ; CHECK-NEXT: %X1 = LDRXui %X1, 24 + %x1 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x1 = LDRXui %x1, target-flags(aarch64-pageoff, aarch64-got) @g4 + %x1 = LDRXui %x1, 24 + ; Should just produce a MCLOH_AdrpLdr (not MCLOH_AdrpLdrGotLdr) + %x2 = ADRP target-flags(aarch64-page) @g3 + %x2 = LDRXui %x2, target-flags(aarch64-pageoff) @g3 + %x2 = LDRXui %x2, 24 + + bb.11: + ; CHECK-NEXT: Adding MCLOH_AdrpLdr + ; CHECK-NEXT: %X5 = ADRP + ; CHECK-NEXT: %X5 = LDRXui %X5, + ; CHECK-NEXT: Adding MCLOH_AdrpLdrGotStr: + ; CHECK-NEXT: %X1 = ADRP + ; CHECK-NEXT: %X1 = LDRXui %X1, + ; CHECK-NEXT: STRXui %XZR, %X1, 32 + %x1 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x1 = LDRXui %x1, target-flags(aarch64-pageoff, aarch64-got) @g4 + STRXui %xzr, %x1, 32 + ; Should just produce a MCLOH_AdrpLdr (not MCLOH_AdrpLdrGotStr) + %x5 = ADRP target-flags(aarch64-page) @g1 + %x5 = LDRXui %x5, target-flags(aarch64-pageoff) @g1 + STRXui undef %x11, %x5, 32 + + bb.12: + ; CHECK-NOT: MCLOH_AdrpAdrp + ; CHECK: Adding MCLOH_AdrpAddLdr + ; %X9 = ADRP + ; %X9 = ADDXri %X9, + ; %X5 = LDRXui %X9, 0 + %x9 = ADRP target-flags(aarch64-page, aarch64-got) @g4 + %x9 = ADDXri %x9, target-flags(aarch64-pageoff, aarch64-got) @g4, 0 + %x5 = LDRXui %x9, 0 + %x9 = ADRP target-flags(aarch64-page, aarch64-got) @g5 + + bb.13: + successors: %bb.14 + ; Cannot produce a LOH for multiple users + ; CHECK-NOT: MCLOH_AdrpAdd + %x10 = ADRP target-flags(aarch64-page) @g0 + %x11 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 + B %bb.14 + + bb.14: + liveins: %x10 + %x12 = ADDXri %x10, target-flags(aarch64-pageoff) @g0, 0 +... 
diff --git a/test/CodeGen/AArch64/machine-scheduler.mir b/test/CodeGen/AArch64/machine-scheduler.mir index e7e0dda53c5..933afdb6da9 100644 --- a/test/CodeGen/AArch64/machine-scheduler.mir +++ b/test/CodeGen/AArch64/machine-scheduler.mir @@ -21,8 +21,9 @@ # CHECK: LDRWui %x0, 0 # CHECK: LDRWui %x0, 1 # CHECK: STRWui %w1, %x0, 2 -name: load_imp-def -body: | +name: load_imp-def +tracksRegLiveness: true +body: | bb.0.entry: liveins: %w1, %x0 %w8 = LDRWui %x0, 1, implicit-def %x8 :: (load 4 from %ir.0) diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll index 28c8b5d73b0..d9662b69b12 100644 --- a/test/CodeGen/AMDGPU/hsa-func.ll +++ b/test/CodeGen/AMDGPU/hsa-func.ll @@ -30,12 +30,11 @@ ; ELF: Type: Function (0x2) ; ELF: } +; HSA: .text ; HSA: .hsa_code_object_version 2,1 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .text - ; HSA-NOT: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 78a5cdb576f..12c15441c0f 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -34,12 +34,12 @@ ; ELF: Type: AMDGPU_HSA_KERNEL (0xA) ; ELF: } +; HSA-NOT: .AMDGPU.config +; HSA: .text ; HSA: .hsa_code_object_version 2,1 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .text - ; HSA: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t diff --git a/test/CodeGen/Generic/cfi-sections.ll b/test/CodeGen/Generic/cfi-sections.ll new file mode 100644 index 00000000000..6e721d6df70 --- /dev/null +++ b/test/CodeGen/Generic/cfi-sections.ll @@ -0,0 +1,39 @@ +; When using Itanium ABI, do not emit .debug_frame. +; RUNT: llc -mtriple=i386--linux -o - < %s | FileCheck %s -check-prefix=WITHOUT +; RUNT: llc -mtriple=armv7-netbsd-eabi -o - < %s | FileCheck %s -check-prefix=WITHOUT + +; When using EHABI, do emit .debug_frame. 
+; RUN: llc -mtriple=arm-linux -mcpu=cortex-a7 -mattr=v7 -o - < %s | FileCheck %s -check-prefix=WITH + +; REQUIRES: x86-registered-target +; REQUIRES: arm-registered-target + +; WITH: .cfi_sections .debug_frame +; WITHOUT-NOT: .cfi_sections + +define i32 @foo() #0 !dbg !7 { + %1 = call i32 @bar() + %2 = call i32 @bar() + %3 = add nsw i32 %1, %2 + ret i32 %3 +} + +declare i32 @bar() #1 + +attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="arm7tdmi" "target-features"="+soft-float,+strict-align,-crypto,-neon" "unsafe-fp-math"="false" "use-soft-float"="true" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "cfi-sections.cc", directory: ".") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{i32 1, !"min_enum_size", i32 4} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{!10} +!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) diff --git a/test/CodeGen/MIR/AArch64/spill-fold.mir b/test/CodeGen/MIR/AArch64/spill-fold.mir new file mode 100644 index 00000000000..05e7f7521ed --- /dev/null +++ b/test/CodeGen/MIR/AArch64/spill-fold.mir @@ -0,0 +1,82 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass greedy -verify-machineinstrs -o - %s | FileCheck %s +--- | + define i64 @test_subreg_spill_fold() { ret i64 0 } + define i64 @test_subreg_spill_fold2() { ret i64 0 } + define i64 @test_subreg_spill_fold3() { ret i64 0 } + define i64 @test_subreg_fill_fold() { ret i64 0 } + define double @test_subreg_fill_fold2() { ret double 0.0 } +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold +# Ensure that the spilled subreg COPY is eliminated and folded into the spill store. 
+name: test_subreg_spill_fold +registers: + - { id: 0, class: gpr64 } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.sub_32 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + %x0 = COPY %0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold2 +# Similar to test_subreg_spill_fold, but with a vreg0 register class not containing %WZR. +name: test_subreg_spill_fold2 +registers: + - { id: 0, class: gpr64sp } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.sub_32 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + %x0 = ADDXri %0, 1, 0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_spill_fold3 +# Similar to test_subreg_spill_fold, but with a cross register class copy. 
+name: test_subreg_spill_fold3 +registers: + - { id: 0, class: fpr64 } +body: | + bb.0: + ; CHECK: STRXui %xzr, %stack.0, 0 :: (store 8 into %stack.0) + undef %0.ssub = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %d0, 12, implicit-def dead %d1, 12, implicit-def dead %d2, 12, implicit-def dead %d3, 12, implicit-def dead %d4, 12, implicit-def dead %d5, 12, implicit-def dead %d6, 12, implicit-def dead %d7, 12, implicit-def dead %d8, 12, implicit-def dead %d9, 12, implicit-def dead %d10, 12, implicit-def dead %d11, 12, implicit-def dead %d12, 12, implicit-def dead %d13, 12, implicit-def dead %d14, 12, implicit-def dead %d15, 12, implicit-def dead %d16, 12, implicit-def dead %d17, 12, implicit-def dead %d18, 12, implicit-def dead %d19, 12, implicit-def dead %d20, 12, implicit-def dead %d21, 12, implicit-def dead %d22, 12, implicit-def dead %d23, 12, implicit-def dead %d24, 12, implicit-def dead %d25, 12, implicit-def dead %d26, 12, implicit-def dead %d27, 12, implicit-def dead %d28, 12, implicit-def dead %d29, 12, implicit-def dead %d30, 12, implicit-def %d31 + %x0 = COPY %0 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_fill_fold +# Ensure that the filled COPY is eliminated and folded into the fill load. +name: test_subreg_fill_fold +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr64 } +body: | + bb.0: + %0 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + ; CHECK: undef %1.sub_32 = LDRWui %stack.0, 0 :: (load 4 from %stack.0) + undef %1.sub_32 = COPY %0 + %x0 = COPY %1 + RET_ReallyLR implicit %x0 +... +--- +# CHECK-LABEL: name: test_subreg_fill_fold2 +# Similar to test_subreg_fill_fold, but with a cross-class copy. 
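+# The reload feeds the ssub lane of an FPR64 vreg, so the expected fill uses
+# LDRSui rather than the LDRWui form checked in test_subreg_fill_fold.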
+name: test_subreg_fill_fold2 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: fpr64 } +body: | + bb.0: + %0 = COPY %wzr + INLINEASM $nop, 1, 12, implicit-def dead %x0, 12, implicit-def dead %x1, 12, implicit-def dead %x2, 12, implicit-def dead %x3, 12, implicit-def dead %x4, 12, implicit-def dead %x5, 12, implicit-def dead %x6, 12, implicit-def dead %x7, 12, implicit-def dead %x8, 12, implicit-def dead %x9, 12, implicit-def dead %x10, 12, implicit-def dead %x11, 12, implicit-def dead %x12, 12, implicit-def dead %x13, 12, implicit-def dead %x14, 12, implicit-def dead %x15, 12, implicit-def dead %x16, 12, implicit-def dead %x17, 12, implicit-def dead %x18, 12, implicit-def dead %x19, 12, implicit-def dead %x20, 12, implicit-def dead %x21, 12, implicit-def dead %x22, 12, implicit-def dead %x23, 12, implicit-def dead %x24, 12, implicit-def dead %x25, 12, implicit-def dead %x26, 12, implicit-def dead %x27, 12, implicit-def dead %x28, 12, implicit-def dead %fp, 12, implicit-def dead %lr, 12, implicit-def %sp + ; CHECK: undef %1.ssub = LDRSui %stack.0, 0 :: (load 4 from %stack.0) + undef %1.ssub = COPY %0 + %d0 = COPY %1 + RET_ReallyLR implicit %d0 +... diff --git a/test/CodeGen/MIR/X86/basic-block-liveins.mir b/test/CodeGen/MIR/X86/basic-block-liveins.mir index 35f5512936b..b347368a94b 100644 --- a/test/CodeGen/MIR/X86/basic-block-liveins.mir +++ b/test/CodeGen/MIR/X86/basic-block-liveins.mir @@ -22,7 +22,8 @@ ... --- -name: test +name: test +tracksRegLiveness: true body: | ; CHECK-LABEL: bb.0.body: ; CHECK-NEXT: liveins: %edi, %esi @@ -33,7 +34,8 @@ body: | RETQ %eax ... --- -name: test2 +name: test2 +tracksRegLiveness: true body: | ; CHECK-LABEL: name: test2 ; Verify that we can have multiple lists of liveins that will be merged into @@ -48,7 +50,8 @@ body: | RETQ %eax ... --- -name: test3 +name: test3 +tracksRegLiveness: true body: | ; Verify that we can have an empty list of liveins. ; CHECK-LABEL: name: test3 diff --git a/test/CodeGen/MIR/X86/machine-verifier.mir b/test/CodeGen/MIR/X86/machine-verifier.mir index c56bab8c998..7421146c22e 100644 --- a/test/CodeGen/MIR/X86/machine-verifier.mir +++ b/test/CodeGen/MIR/X86/machine-verifier.mir @@ -10,7 +10,8 @@ ... 
--- -name: inc +name: inc +tracksRegLiveness: true body: | bb.0.entry: liveins: %edi diff --git a/test/CodeGen/NVPTX/tid-range.ll b/test/CodeGen/NVPTX/tid-range.ll new file mode 100644 index 00000000000..3dc4008810a --- /dev/null +++ b/test/CodeGen/NVPTX/tid-range.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -march=nvptx64 | FileCheck %s +declare i32 @get_register() + +define i1 @test1() { +entry: + %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !0 + %cmp = icmp eq i32 %call, 1 + ret i1 %cmp +} + +; CHECK-LABEL: test1( +; CHECK: setp.eq.s32 %p1, %r1, 1; +; CHECK: selp.u32 %[[R:.+]], 1, 0, %p1; +; CHECK: st.param.b32 [func_retval0+0], %[[R]]; + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() + +!0 = !{ i32 0, i32 3 } diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll index 6cda38aa94f..425d2609380 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll @@ -24,7 +24,7 @@ define void @test_void_return() { ; CHECK-NEXT: hasVAStart: false ; CHECK-NEXT: hasMustTailInVarArgFunc: false ; CHECK-NEXT: body: -; CHECK-NEXT: bb.1: +; CHECK-NEXT: bb.1.entry: ; CHECK-NEXT: RET 0 entry: ret void diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 3c649e18bc3..8590d641a4c 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2902,6 +2902,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, < define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { ; CHECK-LABEL: test_mask_vextracti64x4: ; CHECK: ## BB#0: +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: kshiftlw $12, %k1, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 @@ -2923,7 +2924,7 @@ define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) ; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2 ; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq - %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask) ret <4 x i64> %res } @@ -2963,9 +2964,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x define <4 x double> @test_vextractf64x4(<8 x double> %a) { ; CHECK-LABEL: test_vextractf64x4: ; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; CHECK-NEXT: retq - %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1) ret <4 x double> %res } diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll index 646697b82c2..04d21ecd3e8 100644 --- a/test/CodeGen/X86/avx512-trunc.ll +++ b/test/CodeGen/X86/avx512-trunc.ll @@ -500,3 +500,110 @@ define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { store <8 x i8> %x, <8 x i8>* %res ret void } + + +define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { +; KNL-LABEL: usat_trunc_wb_256_mem: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vmovdqu %xmm0, (%rdi) +; KNL-NEXT: retq +; +; 
SKX-LABEL: usat_trunc_wb_256_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %ymm0, (%rdi) +; SKX-NEXT: retq + %x3 = icmp ult <16 x i16> %i, + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { +; KNL-LABEL: usat_trunc_wb_256: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_256: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %ymm0, %xmm0 +; SKX-NEXT: retq + %x3 = icmp ult <16 x i16> %i, + %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> + %x6 = trunc <16 x i16> %x5 to <16 x i8> + ret <16 x i8> %x6 +} + +define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) { +; KNL-LABEL: usat_trunc_wb_128_mem: +; KNL: ## BB#0: +; KNL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: usat_trunc_wb_128_mem: +; SKX: ## BB#0: +; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: retq + %x3 = icmp ult <8 x i16> %i, + %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> + %x6 = trunc <8 x i16> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) { +; ALL-LABEL: usat_trunc_db_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusdb %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <16 x i32> %i, + %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> + %x6 = trunc <16 x i32> %x5 to <16 x i8> + store <16 x i8> %x6, <16 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) { +; ALL-LABEL: usat_trunc_qb_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> + %x6 = trunc <8 x i64> %x5 to <8 x i8> + store <8 x i8> %x6, <8 x i8>* %res, align 1 + ret void +} + +define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) { +; ALL-LABEL: usat_trunc_qd_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqd %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> + %x6 = trunc <8 x i64> %x5 to <8 x i32> + store <8 x i32> %x6, <8 x i32>* %res, align 1 + ret void +} + +define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) { +; ALL-LABEL: usat_trunc_qw_512_mem: +; ALL: ## BB#0: +; ALL-NEXT: vpmovusqw %zmm0, (%rdi) +; ALL-NEXT: retq + %x3 = icmp ult <8 x i64> %i, + %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> + %x6 = trunc <8 x i64> %x5 to <8 x i16> + store <8 x i16> %x6, <8 x i16>* %res, align 1 + ret void +} + diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll index a961dbac7dd..8e9bc8b5af4 100644 --- a/test/CodeGen/X86/cmov.ll +++ b/test/CodeGen/X86/cmov.ll @@ -156,3 +156,21 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind { %d = select i1 %c, i8 %a, i8 %b ret i8 %d } + +; FIXME: The 'not' is redundant. 
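+; (The expected output computes ~x twice, once with 'notl %ecx' and once with
+; 'xorl $-1, %edi'; the xor alone already produces both the value and the flags
+; the cmov needs.)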
+ +define i32 @smin(i32 %x) { +; CHECK-LABEL: smin: +; CHECK: ## BB#0: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: notl %ecx +; CHECK-NEXT: xorl $-1, %edi +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: cmovsl %ecx, %eax +; CHECK-NEXT: retq + %not_x = xor i32 %x, -1 + %1 = icmp slt i32 %not_x, -1 + %sel = select i1 %1, i32 %not_x, i32 -1 + ret i32 %sel +} + diff --git a/test/CodeGen/X86/lower-vec-shift-2.ll b/test/CodeGen/X86/lower-vec-shift-2.ll index 6ca76c2e7e4..a617f44d3f9 100644 --- a/test/CodeGen/X86/lower-vec-shift-2.ll +++ b/test/CodeGen/X86/lower-vec-shift-2.ll @@ -12,8 +12,7 @@ define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test1: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -32,8 +31,7 @@ define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test2: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -68,8 +66,7 @@ define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test4: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -88,8 +85,7 @@ define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test5: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -124,8 +120,7 @@ define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) { ; ; AVX-LABEL: test7: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -144,8 +139,7 @@ define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { ; ; AVX-LABEL: test8: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/test/CodeGen/X86/shuffle-vs-trunc-128.ll new file mode 100644 index 00000000000..f9fe97b21ee --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -0,0 +1,481 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. + +define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v8i16_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i16_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i16_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i16_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <8 x i16> + %strided.vec = trunc <8 x i16> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v8i16_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: 
shuffle_v8i16_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i16_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v4i32_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i32_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i32_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i32_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %bc = bitcast <8 x i16> %vec to <4 x i32> + %strided.vec = trunc <4 x i32> %bc to <4 x i16> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { +; AVX-LABEL: shuffle_v4i32_to_v2i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v4i32_to_v2i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_to_v2i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v4i32_to_v2i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <4 x i32>, <4 x i32>* %L + %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> + store <2 x i32> %strided.vec, <2 x i32>* %S + ret void +} + +define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { +; 
AVX-LABEL: trunc_v2i64_to_v2i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <4 x i32>, <4 x i32>* %L + %bc = bitcast <4 x i32> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i32> + store <2 x i32> %strided.vec, <2 x i32>* %S + ret void +} + +define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v4i32_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i32_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i32_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i32_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <4 x i32> + 
%strided.vec = trunc <4 x i32> %bc to <4 x i8> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v8i16_to_v2i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v8i16_to_v2i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i16_to_v2i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i16_to_v2i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> + store <2 x i16> %strided.vec, <2 x i16>* %S + ret void +} + +define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v2i64_to_v2i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i16>, <8 x i16>* %L + %bc = bitcast <8 x i16> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i16> + store <2 x i16> %strided.vec, <2 x i16>* %S + ret void +} + +define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v16i8_to_v2i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i8_to_v2i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_to_v2i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i8_to_v2i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> + store <2 x i8> %strided.vec, <2 x i8>* %S + ret void +} + +define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v2i64_to_v2i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v2i64_to_v2i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v2i64_to_v2i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v2i64_to_v2i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i8>, <16 x i8>* %L + %bc = bitcast <16 x i8> %vec to <2 x i64> + %strided.vec = trunc <2 x i64> %bc to <2 x i8> + store <2 x i8> %strided.vec, <2 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll new file mode 100644 index 00000000000..893f96e6fb2 --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -0,0 +1,629 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. 
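+; As a sketch of the pattern (not an exact copy of the IR below): taking the
+; even bytes of a <32 x i8> value,
+;   %even = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, ..., i32 30>
+; is equivalent on little-endian x86 to truncating its <16 x i16> reinterpretation,
+;   %bc = bitcast <32 x i8> %v to <16 x i16>
+;   %tr = trunc <16 x i16> %bc to <16 x i8>
+; because the truncate keeps the low byte of every i16 lane.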
+ +define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v16i16_to_v16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v16i16_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v16i16_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v16i16_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; 
AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <16 x i16> + %strided.vec = trunc <16 x i16> %bc to <16 x i8> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v16i16_to_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i16_to_v8i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v8i32_to_v8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %bc = bitcast <16 x i16> %vec to <8 x i32> + %strided.vec = trunc <8 x i32> %bc to <8 x i16> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { +; AVX-LABEL: shuffle_v8i32_to_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v8i32_to_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512F-NEXT: vmovaps %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i32>, <8 x i32>* %L + %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> + store <4 x i32> %strided.vec, <4 x i32>* %S + ret void +} + +define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; 
AVX512BW-LABEL: trunc_v4i64_to_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <8 x i32>, <8 x i32>* %L + %bc = bitcast <8 x i32> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i32> + store <4 x i32> %strided.vec, <4 x i32>* %S + ret void +} + +define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v8i32_to_v8i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v8i32_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v8i32_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v8i32_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <8 x i32> + %strided.vec = trunc <8 x i32> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: shuffle_v16i16_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v16i16_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i16: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <16 x i16>, <16 x i16>* %L + %bc = bitcast <16 x i16> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i16> + store <4 x i16> %strided.vec, <4 x i16>* %S + ret void +} + +define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: shuffle_v32i8_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: shuffle_v32i8_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; 
AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} + +define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { +; AVX-LABEL: trunc_v4i64_to_v4i8: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512F-LABEL: trunc_v4i64_to_v4i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_to_v4i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_to_v4i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i8>, <32 x i8>* %L + %bc = bitcast <32 x i8> %vec to <4 x i64> + %strided.vec = trunc <4 x i64> %bc to <4 x i8> + store <4 x i8> %strided.vec, <4 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll new file mode 100644 index 00000000000..923290411ae --- /dev/null +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -0,0 +1,537 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL + +; PR31551 +; Pairs of shufflevector:trunc functions with functional equivalence. +; Ideally, the shuffles should be lowered to code with the same quality as the truncates. 
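+; At 512 bits the contrast is sharpest: with AVX512BW the truncate side can be
+; a single vpmovwb from a zmm register, while the equivalent shuffle currently
+; lowers to a vpshufb/vpblendd/vpermq sequence (see the first pair below).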
+ +define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v32i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> + store <32 x i8> %strided.vec, <32 x i8>* %S + ret void +} + +define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { +; AVX512F-LABEL: trunc_v32i16_to_v32i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_v32i16_to_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1 +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_v32i16_to_v32i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: 
vpmovwb %zmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <32 x i16> + %strided.vec = trunc <32 x i16> %bc to <32 x i8> + store <32 x i8> %strided.vec, <32 x i8>* %S + ret void +} + +define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { +; AVX512F-LABEL: shuffle_v32i16_to_v16i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30] +; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> + store <16 x i16> %strided.vec, <16 x i16>* %S + ret void +} + +define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { +; AVX512-LABEL: trunc_v16i32_to_v16i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %bc = bitcast <32 x i16> %vec to <16 x i32> + %strided.vec = trunc <16 x i32> %bc to <16 x i16> + store <16 x i16> %strided.vec, <16 x i16>* %S + ret void +} + +define void 
@shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { +; AVX512-LABEL: shuffle_v16i32_to_v8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i32>, <16 x i32>* %L + %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> + store <8 x i32> %strided.vec, <8 x i32>* %S + ret void +} + +define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <16 x i32>, <16 x i32>* %L + %bc = bitcast <16 x i32> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i32> + store <8 x i32> %strided.vec, <8 x i32>* %S + ret void +} + +define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v16i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm2, 
%eax +; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx +; AVX512BWVL-NEXT: vmovd %ecx, %xmm1 +; AVX512BWVL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { +; AVX512-LABEL: trunc_v16i32_to_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <16 x i32> + %strided.vec = trunc <16 x i32> %bc to <16 x i8> + store <16 x i8> %strided.vec, <16 x i8>* %S + ret void +} + +define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX512F-LABEL: shuffle_v32i16_to_v8i16: +; AVX512F: # BB#0: 
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vmovd %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vmovd %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 
+; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <32 x i16>, <32 x i16>* %L + %bc = bitcast <32 x i16> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i16> + store <8 x i16> %strided.vec, <8 x i16>* %S + ret void +} + +define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX512F-LABEL: shuffle_v64i8_to_v8i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %r8d +; AVX512BW-NEXT: vpextrb $0, %xmm1, %r9d +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %r10d +; AVX512BW-NEXT: vpextrb $0, %xmm1, %r11d +; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax +; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx +; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx +; AVX512BW-NEXT: vpextrb $0, %xmm0, %edi +; AVX512BW-NEXT: vpinsrb $0, %edi, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; 
AVX512BW-NEXT: vpinsrb $4, %r11d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} + +define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { +; AVX512-LABEL: trunc_v8i64_to_v8i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: retq + %vec = load <64 x i8>, <64 x i8>* %L + %bc = bitcast <64 x i8> %vec to <8 x i64> + %strided.vec = trunc <8 x i64> %bc to <8 x i8> + store <8 x i8> %strided.vec, <8 x i8>* %S + ret void +} diff --git a/test/CodeGen/X86/tail-call-conditional.mir b/test/CodeGen/X86/tail-call-conditional.mir index af6e95d4610..75cb1e451d8 100644 --- a/test/CodeGen/X86/tail-call-conditional.mir +++ b/test/CodeGen/X86/tail-call-conditional.mir @@ -26,7 +26,8 @@ ... 
--- -name: test +name: test +tracksRegLiveness: true liveins: - { reg: '%rdi' } - { reg: '%rsi' } diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index 50febd4c1ec..fbb67ebbf60 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -87,14 +87,12 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 ; X32-SSE-NEXT: psllq %xmm3, %xmm4 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: movdqa %xmm0, %xmm3 ; X32-SSE-NEXT: psllq %xmm1, %xmm3 ; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1 -; X32-SSE-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero ; X32-SSE-NEXT: psrlq %xmm2, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; X32-SSE-NEXT: orpd %xmm4, %xmm1 diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index fc67914015b..27b65b82992 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
@@ -80,7 +80,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX512-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3 ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -90,20 +90,19 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; X32-SSE-LABEL: var_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: psrlq %xmm2, %xmm4 -; X32-SSE-NEXT: movq {{.*#+}} xmm5 = xmm1[0],zero -; X32-SSE-NEXT: psrlq %xmm5, %xmm3 -; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlq %xmm2, %xmm1 -; X32-SSE-NEXT: psrlq %xmm5, %xmm0 -; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; X32-SSE-NEXT: xorpd %xmm4, %xmm1 -; X32-SSE-NEXT: psubq %xmm4, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X32-SSE-NEXT: movdqa %xmm2, %xmm4 +; X32-SSE-NEXT: psrlq %xmm3, %xmm4 +; X32-SSE-NEXT: psrlq %xmm1, %xmm2 +; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: psrlq %xmm3, %xmm2 +; X32-SSE-NEXT: psrlq %xmm1, %xmm0 +; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; X32-SSE-NEXT: xorpd %xmm4, %xmm2 +; X32-SSE-NEXT: psubq %xmm4, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <2 x i64> %a, %b ret <2 x i64> %shift @@ -189,7 +188,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -323,11 +322,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -499,7 +498,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] @@ -627,7 +626,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX512-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 @@ -637,7 +636,6 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; 
X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X32-SSE-NEXT: psrlq %xmm1, %xmm2 ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 @@ -659,29 +657,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: psrad %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrad %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -706,29 +700,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psraw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psraw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -919,7 +909,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] @@ -1066,7 +1056,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} 
xmm1 = [4611686018427387904,72057594037927936] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -1150,7 +1140,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1232,11 +1222,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -1393,7 +1383,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] @@ -1528,7 +1518,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $7, %xmm0, %xmm1 ; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] @@ -1564,7 +1554,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1593,7 +1583,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1632,7 +1622,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 5725fcb8c12..ee1879b6696 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; @@ 
-66,7 +66,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 ; AVX512-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -131,7 +131,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, %b @@ -213,11 +213,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -332,7 +332,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -411,7 +411,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 ; AVX512-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 @@ -426,9 +426,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -436,16 +435,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, 
%xmm2, %ymm0, %ymm0 @@ -453,15 +450,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -473,8 +468,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -482,16 +476,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -499,15 +491,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -619,7 +609,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] @@ -702,7 +692,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; 
XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -750,7 +740,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, @@ -815,11 +805,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -924,7 +914,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] @@ -998,7 +988,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $7, %ymm0, %ymm1 ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -1035,7 +1025,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <8 x i32> %a, @@ -1070,7 +1060,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = ashr <16 x i16> %a, @@ -1123,7 +1113,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll index 27ff134fd10..1280641c557 100644 --- a/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s 
--check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, %b @@ -16,7 +16,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, %b @@ -25,7 +25,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -48,7 +48,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, %b @@ -57,7 +57,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -109,100 +109,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; 
AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; 
AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -212,11 +212,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -224,85 +224,85 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx 
; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -310,17 +310,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -328,89 +328,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: 
%CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -418,86 +418,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, 
%xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -514,7 +514,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # 
BB#0: ; ALL-NEXT: vpsraq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -524,9 +524,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -536,17 +535,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsraw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsraw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -556,7 +553,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] @@ -602,101 +599,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; 
AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -706,11 +703,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, 
%eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -718,85 +715,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; 
AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -804,17 +801,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -822,89 +819,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: 
movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -912,86 +909,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: 
movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: sarb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -1009,7 +1006,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, @@ -1018,7 +1015,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, @@ -1027,7 
+1024,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] @@ -1049,7 +1046,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, @@ -1058,7 +1055,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] @@ -1104,7 +1101,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1362,7 +1359,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsraq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, @@ -1371,7 +1368,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrad $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <16 x i32> %a, @@ -1380,13 +1377,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsraw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, @@ -1395,7 +1392,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1409,7 +1406,7 @@ define <64 x i8> 
@splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] @@ -1422,14 +1419,14 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) { ; AVX512DQ-LABEL: ashr_const7_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: ashr_const7_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 0dab815d4d4..42488f2ec3a 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 @@ -60,7 +60,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -69,7 +69,6 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psrlq %xmm3, %xmm2 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; X32-SSE-NEXT: movapd %xmm2, %xmm0 @@ -158,7 +157,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -292,11 +291,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -417,7 +416,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 @@ -487,13 +486,12 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psrlq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -511,29 +509,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: psrld %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: psrld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: 
vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -558,29 +552,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psrlw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psrlw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -709,7 +699,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 @@ -810,7 +800,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -884,7 +874,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -966,11 +956,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -1073,7 +1063,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm2 @@ -1145,7 +1135,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1174,7 +1164,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: 
splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1203,7 +1193,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1236,7 +1226,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index 09822ee6c61..5223d7bba35 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts ; @@ -47,7 +47,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, %b @@ -108,7 +108,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, %b @@ -190,11 +190,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -276,7 +276,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -326,7 +326,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -337,9 +337,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind 
{ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -347,16 +346,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -364,15 +361,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -384,8 +379,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -393,16 +387,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -410,15 +402,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; 
XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -501,7 +491,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -560,7 +550,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, @@ -605,7 +595,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, @@ -670,11 +660,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -750,7 +740,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 @@ -801,7 +791,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <4 x i64> %a, @@ -836,7 +826,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <8 x i32> %a, @@ -871,7 +861,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = lshr <16 x i16> %a, @@ -913,7 +903,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq diff --git 
a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll index 06bf12a621a..4c3caf329fb 100644 --- a/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -8,7 +8,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, %b @@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, %b @@ -26,7 +26,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -49,7 +49,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, %b @@ -58,7 +58,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 @@ -89,100 +89,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: 
## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; 
AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -192,11 +192,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -204,85 +204,85 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; 
AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -290,17 +290,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -308,89 +308,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: 
movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -398,86 +398,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; 
AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -494,7 +494,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define 
<8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -504,9 +504,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -516,17 +515,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -536,7 +533,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -565,101 +562,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, 
%eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -669,11 +666,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; 
AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -681,85 +678,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: 
vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -767,17 +764,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -785,89 +782,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, 
%eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -875,86 +872,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, 
%eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shrb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -972,7 +969,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, @@ -981,7 +978,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, @@ -990,7 +987,7 @@ define <16 x i32> 
@constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] @@ -1012,7 +1009,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, @@ -1021,7 +1018,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1050,7 +1047,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1308,7 +1305,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrlq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, @@ -1317,7 +1314,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsrld $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <16 x i32> %a, @@ -1326,13 +1323,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, @@ -1341,7 +1338,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1350,7 +1347,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git 
a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index ec2e61d3ca0..5c89949e924 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Just one 32-bit run to make sure we do reasonable things for i64 shifts. ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 @@ -58,7 +58,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -67,7 +67,6 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: psllq %xmm3, %xmm2 -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psllq %xmm1, %xmm0 ; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; X32-SSE-NEXT: movapd %xmm2, %xmm0 @@ -124,7 +123,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -247,11 +246,11 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM1 %XMM1 %ZMM1 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM1 %XMM1 %ZMM1 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v8i16: @@ -367,7 +366,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 @@ -435,13 +434,12 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v2i64: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-SSE-NEXT: psllq %xmm1, %xmm0 ; X32-SSE-NEXT: retl %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -459,29 +457,25 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; ; SSE41-LABEL: 
splatvar_shift_v4i32: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; SSE41-NEXT: pslld %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pslld %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v4i32: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -506,29 +500,25 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_shift_v8i16: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psllw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: psllw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: splatvar_shift_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatvar_shift_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -650,7 +640,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 @@ -747,7 +737,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -802,7 +792,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -838,11 +828,11 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %XMM0 
%XMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v8i16: @@ -928,7 +918,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $4, %xmm0, %xmm2 @@ -998,7 +988,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1027,7 +1017,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $5, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1056,7 +1046,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -1087,7 +1077,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index 00d04063301..eb52ae3ccac 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -3,7 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -45,7 +45,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, %b @@ -89,7 +89,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, %b @@ -165,11 +165,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM1 %YMM1 %ZMM1 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # 
kill: %YMM1 %YMM1 %ZMM1 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -241,7 +241,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: var_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -290,7 +290,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer @@ -301,9 +301,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX1-LABEL: splatvar_shift_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -311,16 +310,14 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v8i32: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -328,15 +325,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; XOPAVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer @@ -348,8 +343,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX1-LABEL: splatvar_shift_v16i16: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: 
vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -357,16 +351,14 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; AVX2-LABEL: splatvar_shift_v16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpextrw $0, %xmm1, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v16i16: ; XOPAVX1: # BB#0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm1 +; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -374,15 +366,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; ; XOPAVX2-LABEL: splatvar_shift_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpextrw $0, %xmm1, %eax -; XOPAVX2-NEXT: vmovd %eax, %xmm1 +; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512: # BB#0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -457,7 +447,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatvar_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -512,7 +502,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, @@ -547,7 +537,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, @@ -582,11 +572,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -656,7 +646,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 @@ -706,7 +696,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: 
splatconstant_shift_v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $7, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <4 x i64> %a, @@ -741,7 +731,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $5, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <8 x i32> %a, @@ -776,7 +766,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512-NEXT: retq %shift = shl <16 x i16> %a, @@ -817,7 +807,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll index eb1309d9bb0..520c3237a57 100644 --- a/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/test/CodeGen/X86/vector-shift-shl-512.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; ; Variable Shifts @@ -8,7 +8,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: var_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, %b @@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: var_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, %b @@ -26,7 +26,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] @@ -49,7 +49,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, %b @@ -58,7 +58,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> 
%a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 @@ -86,100 +86,100 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %ecx ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax -; 
AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -189,11 +189,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -201,85 +201,85 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; 
AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -287,17 +287,17 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx ; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm4, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -305,89 +305,89 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: 
vpextrb $3, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl 
%dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -395,86 +395,86 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: 
%CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -491,7 +491,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -501,9 +501,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { ; ALL-LABEL: splatvar_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; ALL: # BB#0: +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -513,17 +512,15 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: -; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vpextrw $0, %xmm2, %eax -; AVX512DQ-NEXT: vmovd %eax, %xmm2 +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: -; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax -; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer @@ -533,7 +530,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] @@ -559,101 +556,101 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # 
BB#0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; AVX512BW-NEXT: vpextrb $1, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; 
AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm3, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm2 @@ -663,11 +660,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: movzbl %dl, %ecx @@ -675,85 +672,85 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $2, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $3, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $6, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $7, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb 
$9, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $10, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $11, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 @@ -761,17 +758,17 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpextrb $1, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4 ; AVX512BW-NEXT: vpextrb $1, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $2, %xmm3, %esi ; AVX512BW-NEXT: vpextrb $2, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: movzbl %dl, %ecx ; AVX512BW-NEXT: vmovd %ecx, %xmm5 @@ -779,89 +776,89 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: movzbl %sil, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $3, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $4, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, 
%eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $5, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $5, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $7, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $8, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $9, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $9, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $11, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $12, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: vpextrb $13, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx ; AVX512BW-NEXT: vpextrb $15, %xmm4, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $1, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %sil ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 ; AVX512BW-NEXT: movzbl %dl, %eax @@ -869,86 +866,86 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> 
%b) nounwind { ; AVX512BW-NEXT: vmovd %ecx, %xmm4 ; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $2, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $4, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $5, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $7, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $8, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $10, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $11, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $13, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $14, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %dl ; 
AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: movzbl %dl, %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax ; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: ## kill: %CL %CL %ECX +; AVX512BW-NEXT: # kill: %CL %CL %ECX ; AVX512BW-NEXT: shlb %cl, %al ; AVX512BW-NEXT: movzbl %al, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm0 @@ -966,7 +963,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, @@ -975,7 +972,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, @@ -984,14 +981,14 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, @@ -1000,7 +997,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -1026,7 +1023,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax ; AVX512BW-NEXT: vmovd %eax, %xmm2 @@ -1284,7 +1281,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v8i64: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpsllq $7, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, @@ -1293,7 +1290,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind { define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { ; ALL-LABEL: splatconstant_shift_v16i32: -; ALL: ## BB#0: +; ALL: # BB#0: ; ALL-NEXT: vpslld $5, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <16 x i32> %a, @@ -1302,13 +1299,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: -; AVX512BW: ## BB#0: +; 
AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, @@ -1317,7 +1314,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: -; AVX512DQ: ## BB#0: +; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1326,7 +1323,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index cad8f85395d..2aab77433df 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -702,17 +702,11 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE41-NEXT: pinsrb $5, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -739,17 +733,11 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16( ; SSE41-NEXT: pinsrb $15, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -776,17 +764,11 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE41-NEXT: pinsrb $2, %edi, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: 
vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 3 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -1222,19 +1204,12 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00( ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: -; AVX1OR2: # BB#0: # %entry -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: -; AVX512VL: # BB#0: # %entry -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq entry: %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -1771,21 +1746,13 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: PR31364: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: PR31364: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] -; AVX512VL-NEXT: retq +; AVX-LABEL: PR31364: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] +; AVX-NEXT: retq %v0 = load i8, i8* %a, align 1 %vecins = insertelement <16 x i8> , i8 %v0, i32 0 %v1 = load i8, i8* %b, align 1 diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 4270d3d216c..3e9e980a197 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1354,19 +1354,12 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_z6zz: -; AVX2: 
# BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_z6zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_z6zz: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> ret <4 x i32> %shuffle } @@ -1683,17 +1676,11 @@ define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_0z23: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_0z23: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_0z23: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1726,17 +1713,11 @@ define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_01z3: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_01z3: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_01z3: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1769,17 +1750,11 @@ define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i32_012z: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_012z: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_012z: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } @@ -1812,17 +1787,11 @@ define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: 
shuffle_v4i32_0zz3: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i32_0zz3: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v4i32_0zz3: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 2421b2b579c..ac9db62d3c1 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1415,17 +1415,11 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { ; SSE-NEXT: pinsrw $1, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_z8zzzzzz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1438,17 +1432,11 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { ; SSE-NEXT: pinsrw $5, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zzzzz8zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1461,17 +1449,11 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { ; SSE-NEXT: pinsrw $7, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zuuzuuz8: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1484,17 +1466,11 @@ define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { ; SSE-NEXT: pinsrw $2, %edi, %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $2, %edi, %xmm0, 
%xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zzBzzzzz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 3 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -2102,17 +2078,11 @@ define <8 x i16> @shuffle_v8i16_0z234567(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0z234567: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0z234567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0z234567: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } @@ -2134,17 +2104,11 @@ define <8 x i16> @shuffle_v8i16_0zzzz5z7(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0zzzz5z7: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0zzzz5z7: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0zzzz5z7: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } @@ -2166,17 +2130,11 @@ define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) { ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0123456z: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0123456z: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0123456z: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> ret <8 x i16> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll index 2837c28a484..04d6b373324 100644 --- a/test/CodeGen/X86/vector-shuffle-masked.ll +++ b/test/CodeGen/X86/vector-shuffle-masked.ll @@ -236,3 +236,453 @@ define <8 x i32> @maskz_shuffle_v8i32_23456701(<8 x i32> %a, i8 %mask) { %res = select <8 x i1> 
%mask.cast, <8 x i32> %shuffle, <8 x i32> zeroinitializer ret <8 x i32> %res } + +define <4 x i32> @mask_extract_v16i32_v4i32_0(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_1(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_2(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x i32> @mask_extract_v16i32_v4i32_3(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v4i32_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_0(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_1(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 
x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_2(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x float> @mask_extract_v16f32_v4f32_3(<16 x float> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v4f32_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru + ret <4 x float> %res +} + +define <8 x i32> @mask_extract_v16i32_v8i32_0(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v8i32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x i32> @mask_extract_v16i32_v8i32_1(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16i32_v8i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x float> @mask_extract_v16f32_v8f32_0(<16 x float> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v8f32_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuffle, <8 x float> %passthru + ret <8 x float> %res +} + +define <8 x float> @mask_extract_v16f32_v8f32_1(<16 x float> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v16f32_v8f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuffle, <8 x float> %passthru + ret <8 x float> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_0(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; 
CHECK-LABEL: mask_extract_v8i64_v2i64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_1(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_2(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x i64> @mask_extract_v8i64_v2i64_3(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v2i64_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_0(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $0, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_1(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_2(<8 x 
double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_2: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $2, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <2 x double> @mask_extract_v8f64_v2f64_3(<8 x double> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v2f64_3: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $3, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru + ret <2 x double> %res +} + +define <4 x i64> @mask_extract_v8i64_v4i64_0(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v4i64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x i64> @mask_extract_v8i64_v4i64_1(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v4i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x double> @mask_extract_v8f64_v4f64_0(<8 x double> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v4f64_0: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $0, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle, <4 x double> %passthru + ret <4 x double> %res +} + +define <4 x double> @mask_extract_v8f64_v4f64_1(<8 x double> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v4f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle, <4 x double> 
%passthru + ret <4 x double> %res +} + +define <8 x i32> @mask_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8i64_v8i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> + %shuffle.cast = bitcast <4 x i64> %shuffle to <8 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle.cast, <8 x i32> %passthru + ret <8 x i32> %res +} + +define <8 x float> @mask_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_extract_v8f64_v8f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> + %shuffle.cast = bitcast <4 x double> %shuffle to <8 x float> + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x float> %shuffle.cast, <8 x float> %passthru + ret <8 x float> %res +} + +define <4 x i32> @mask_cast_extract_v8i64_v4i32_1(<8 x i64> %a, <4 x i32> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> + %shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle.cast, <4 x i32> %passthru + ret <4 x i32> %res +} + +define <4 x float> @mask_cast_extract_v8f64_v4f32_1(<8 x double> %a, <4 x float> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> + %shuffle.cast = bitcast <2 x double> %shuffle to <4 x float> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x float> %shuffle.cast, <4 x float> %passthru + ret <4 x float> %res +} + +define <4 x i64> @mask_cast_extract_v16i32_v4i64_1(<16 x i32> %a, <4 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> + %shuffle.cast = bitcast <8 x i32> %shuffle to <4 x i64> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle.cast, <4 x i64> %passthru + ret <4 x i64> %res +} + +define <4 x double> @mask_cast_extract_v16f32_v4f64_1(<16 x float> %a, <4 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: retq + %shuffle 
= shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> + %shuffle.cast = bitcast <8 x float> %shuffle to <4 x double> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %res = select <4 x i1> %mask.extract, <4 x double> %shuffle.cast, <4 x double> %passthru + ret <4 x double> %res +} + +define <2 x i64> @mask_cast_extract_v16i32_v2i64_1(<16 x i32> %a, <2 x i64> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> + %shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle.cast, <2 x i64> %passthru + ret <2 x i64> %res +} + +define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x double> %passthru, i8 %mask) { +; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_1: +; CHECK: # BB#0: +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> + %shuffle.cast = bitcast <4 x float> %shuffle to <2 x double> + %mask.cast = bitcast i8 %mask to <8 x i1> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %res = select <2 x i1> %mask.extract, <2 x double> %shuffle.cast, <2 x double> %passthru + ret <2 x double> %res +} diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 2f5e177badc..bf32e672138 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -100,73 +100,22 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv2i64: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv2i64: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv2i64: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: 
vpsubq %xmm0, %xmm1, %xmm2 -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv2i64: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv2i64: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # BB#0: @@ -873,81 +822,24 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv8i16: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv8i16: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, 
%xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv8i16: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv8i16: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv8i16: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # BB#0: @@ -1071,81 +963,24 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv8i16u: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: 
testv8i16u: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv8i16u: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv8i16u: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv8i16u: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # BB#0: @@ -1253,69 +1088,21 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE41-NEXT: paddb %xmm4, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv16i8: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv16i8: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv16i8: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv16i8: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv16i8: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # BB#0: @@ -1419,69 +1206,21 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE41-NEXT: paddb %xmm4, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: testv16i8u: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv16i8u: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512CDVL-LABEL: testv16i8u: -; AVX512CDVL: # BB#0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: retq -; -; AVX512CD-LABEL: testv16i8u: -; AVX512CD: # BB#0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: retq +; AVX-LABEL: testv16i8u: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # BB#0: diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll index 4e5fb60fae8..5d486e79405 100644 --- a/test/CodeGen/X86/vshift-4.ll +++ b/test/CodeGen/X86/vshift-4.ll @@ -9,7 +9,6 @@ define void @shift1a(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X32-LABEL: shift1a: ; X32: # BB#0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-NEXT: psllq %xmm1, 
%xmm0 ; X32-NEXT: movdqa %xmm0, (%eax) ; X32-NEXT: retl @@ -34,7 +33,6 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind { ; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; X32-NEXT: movdqa %xmm0, %xmm3 ; X32-NEXT: psllq %xmm2, %xmm3 -; X32-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; X32-NEXT: psllq %xmm1, %xmm0 ; X32-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; X32-NEXT: movapd %xmm3, (%eax) diff --git a/test/DebugInfo/Generic/licm-hoist-debug-loc.ll b/test/DebugInfo/Generic/licm-hoist-debug-loc.ll new file mode 100644 index 00000000000..c42396d9059 --- /dev/null +++ b/test/DebugInfo/Generic/licm-hoist-debug-loc.ll @@ -0,0 +1,75 @@ +; RUN: opt -S -licm %s | FileCheck %s +; +; LICM should null out debug locations when it hoists instructions out of a loop. +; +; Generated with +; clang -O0 -S -emit-llvm test.cpp -g -gline-tables-only -o t.ll +; opt -S -sroa -adce -simplifycfg -reassociate -domtree -loops \ +; -loop-simplify -lcssa -basicaa -aa -scalar-evolution -loop-rotate t.ll > test.ll +; +; void bar(int *); +; void foo(int k, int p) +; { +; for (int i = 0; i < k; i++) { +; bar(&p + 4); +; } +; } +; +; We make sure that the instruction that is hoisted into the preheader +; does not have a debug location. +; CHECK: for.body.lr.ph: +; CHECK: getelementptr{{.*}}%p.addr, i64 4{{$}} +; CHECK: for.body: +; +; ModuleID = 't.ll' +source_filename = "test.c" + +; Function Attrs: nounwind sspstrong uwtable +define void @foo(i32 %k, i32 %p) !dbg !7 { +entry: + %p.addr = alloca i32, align 4 + store i32 %p, i32* %p.addr, align 4 + %cmp2 = icmp slt i32 0, %k, !dbg !9 + br i1 %cmp2, label %for.body.lr.ph, label %for.end, !dbg !9 + +for.body.lr.ph: ; preds = %entry + br label %for.body, !dbg !9 + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %add.ptr = getelementptr inbounds i32, i32* %p.addr, i64 4, !dbg !11 + call void @bar(i32* %add.ptr), !dbg !11 + %inc = add nsw i32 %i.03, 1, !dbg !12 + %cmp = icmp slt i32 %inc, %k, !dbg !9 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !dbg !9, !llvm.loop !14 + +for.cond.for.end_crit_edge: ; preds = %for.body + br label %for.end, !dbg !9 + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void, !dbg !16 +} + +declare void @bar(i32*) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.1 (PS4 clang version 4.50.0.249 7e7cd823 checking)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) +!1 = !DIFile(filename: "test.c", directory: "D:\test") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"PIC Level", i32 2} +!6 = !{!"clang version 3.9.1 (PS4 clang version 4.50.0.249 7e7cd823 checking)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 4, scope: !10) +!10 = !DILexicalBlockFile(scope: !7, file: !1, discriminator: 1) +!11 = !DILocation(line: 5, scope: !7) +!12 = !DILocation(line: 4, scope: !13) +!13 = !DILexicalBlockFile(scope: !7, file: !1, discriminator: 2) +!14 = distinct !{!14, !15} +!15 = !DILocation(line: 4, scope: !7) +!16 = !DILocation(line: 7, scope: !7) diff --git 
a/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll index 0667685befc..ddfd7ca7c36 100644 --- a/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll +++ b/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll @@ -73,7 +73,43 @@ define void @store.v4i64.0001(<4 x i32*> %arg) sanitize_address { define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) sanitize_address { ; ALL-LABEL: @store.v4f32.variable %p = load <4 x float>*, <4 x float>** @v4f32, align 8 -; ALL-NOT: call void @__asan_store +; STORE: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0 +; STORE: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]] +; STORE: