diff -pruN 0.17.0-11/BUILD 1.0.0-2/BUILD
--- 0.17.0-11/BUILD	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/BUILD	2022-07-27 11:48:16.000000000 +0000
@@ -141,6 +141,7 @@ cc_library(
     name = "hwy",
     srcs = [
         "hwy/aligned_allocator.cc",
+        "hwy/per_target.cc",
         "hwy/print.cc",
         "hwy/targets.cc",
     ],
@@ -163,6 +164,7 @@ cc_library(
         # End of list
         "hwy/highway.h",  # public
         "hwy/foreach_target.h",  # public
+        "hwy/per_target.h",  # public
         "hwy/print-inl.h",  # public
         "hwy/highway_export.h",  # public
         "hwy/ops/arm_neon-inl.h",
@@ -321,10 +323,14 @@ HWY_TESTS = [
     ("hwy/tests/", "crypto_test"),
     ("hwy/tests/", "demote_test"),
     ("hwy/tests/", "float_test"),
+    ("hwy/tests/", "if_test"),
+    ("hwy/tests/", "interleaved_test"),
     ("hwy/tests/", "logical_test"),
     ("hwy/tests/", "mask_test"),
+    ("hwy/tests/", "mask_mem_test"),
     ("hwy/tests/", "memory_test"),
     ("hwy/tests/", "mul_test"),
+    ("hwy/tests/", "reduction_test"),
     ("hwy/tests/", "reverse_test"),
     ("hwy/tests/", "shift_test"),
     ("hwy/tests/", "swizzle_test"),
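[Editor's note] The new `hwy/per_target.cc` translation unit backs a small runtime query API. A minimal usage sketch, assuming `hwy/per_target.h` declares `hwy::VectorBytes()` (the byte width of the vectors of the best available target) as in upstream 1.0.0:

```cpp
#include <cstdio>
#include "hwy/per_target.h"

int main() {
  // Size in bytes of a vector on the target chosen by dynamic dispatch.
  std::printf("vector bytes: %zu\n", hwy::VectorBytes());
  return 0;
}
```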
diff -pruN 0.17.0-11/CMakeLists.txt 1.0.0-2/CMakeLists.txt
--- 0.17.0-11/CMakeLists.txt	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/CMakeLists.txt	2022-07-27 11:48:16.000000000 +0000
@@ -19,7 +19,7 @@ if(POLICY CMP0083)
   cmake_policy(SET CMP0083 NEW)
 endif()
 
-project(hwy VERSION 0.17.0)  # Keep in sync with highway.h version
+project(hwy VERSION 1.0.0)  # Keep in sync with highway.h version
 
 # Directly define the ABI version from the cmake project() version values:
 set(LIBRARY_VERSION "${hwy_VERSION}")
@@ -63,13 +63,25 @@ check_cxx_source_compiles(
   HWY_EMSCRIPTEN
 )
 
+check_cxx_source_compiles(
+   "int main() {
+      #if !defined(__riscv)
+      static_assert(false, \"__riscv is not defined\");
+      #endif
+      return 0;
+    }"
+  HWY_RISCV
+)
+
 if (HWY_ENABLE_CONTRIB)
-set(HWY_CONTRIB_SOURCES
+# Glob all the traits so we don't need to modify this file when adding
+# additional special cases.
+file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc")
+list(APPEND HWY_CONTRIB_SOURCES
     hwy/contrib/dot/dot-inl.h
     hwy/contrib/image/image.cc
     hwy/contrib/image/image.h
     hwy/contrib/math/math-inl.h
-    hwy/contrib/sort/disabled_targets.h
     hwy/contrib/sort/shared-inl.h
     hwy/contrib/sort/sorting_networks-inl.h
     hwy/contrib/sort/traits-inl.h
@@ -77,24 +89,6 @@ set(HWY_CONTRIB_SOURCES
     hwy/contrib/sort/vqsort-inl.h
     hwy/contrib/sort/vqsort.cc
     hwy/contrib/sort/vqsort.h
-    hwy/contrib/sort/vqsort_128a.cc
-    hwy/contrib/sort/vqsort_128d.cc
-    hwy/contrib/sort/vqsort_f32a.cc
-    hwy/contrib/sort/vqsort_f32d.cc
-    hwy/contrib/sort/vqsort_f64a.cc
-    hwy/contrib/sort/vqsort_f64d.cc
-    hwy/contrib/sort/vqsort_i16a.cc
-    hwy/contrib/sort/vqsort_i16d.cc
-    hwy/contrib/sort/vqsort_i32a.cc
-    hwy/contrib/sort/vqsort_i32d.cc
-    hwy/contrib/sort/vqsort_i64a.cc
-    hwy/contrib/sort/vqsort_i64d.cc
-    hwy/contrib/sort/vqsort_u16a.cc
-    hwy/contrib/sort/vqsort_u16d.cc
-    hwy/contrib/sort/vqsort_u32a.cc
-    hwy/contrib/sort/vqsort_u32d.cc
-    hwy/contrib/sort/vqsort_u64a.cc
-    hwy/contrib/sort/vqsort_u64d.cc
 )
 endif()  # HWY_ENABLE_CONTRIB
 
@@ -121,6 +115,8 @@ set(HWY_SOURCES
     hwy/ops/x86_128-inl.h
     hwy/ops/x86_256-inl.h
     hwy/ops/x86_512-inl.h
+    hwy/per_target.cc
+    hwy/per_target.h
     hwy/print-inl.h
     hwy/print.cc
     hwy/print.h
@@ -225,10 +221,23 @@ else()
     )
   endif()  # HWY_CMAKE_ARM7
 
+  if(HWY_RISCV)
+    list(APPEND HWY_FLAGS -march=rv64gcv1p0)
+    if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+      list(APPEND HWY_FLAGS -menable-experimental-extensions)
+    endif()
+  endif()
+
   if (HWY_WARNINGS_ARE_ERRORS)
     list(APPEND HWY_FLAGS -Werror)
   endif()
 
+  # Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o
+  # because it was not compiled with 'atomics' or 'bulk-memory' features."
+  if (HWY_EMSCRIPTEN)
+    list(APPEND HWY_FLAGS -matomics)
+  endif()
+
 endif()  # !MSVC
 
 # By default prefer STATIC build (legacy behavior)
@@ -281,6 +290,13 @@ target_include_directories(hwy_contrib P
     $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 target_compile_features(hwy_contrib PUBLIC cxx_std_11)
+set_target_properties(hwy_contrib PROPERTIES
+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+if(UNIX AND NOT APPLE)
+  set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()
 endif()  # HWY_ENABLE_CONTRIB
 
 add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
@@ -292,6 +308,13 @@ target_include_directories(hwy_test PUBL
     $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 target_compile_features(hwy_test PUBLIC cxx_std_11)
+set_target_properties(hwy_test PROPERTIES
+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+if(UNIX AND NOT APPLE)
+  set_property(TARGET hwy_test APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()
 
 # -------------------------------------------------------- hwy_list_targets
 # Generate a tool to print the compiled-in targets as defined by the current
@@ -384,8 +407,8 @@ set(gtest_force_shared_crt ON CACHE BOOL
 add_executable(hwy_benchmark hwy/examples/benchmark.cc)
 target_sources(hwy_benchmark PRIVATE
     hwy/nanobenchmark.h)
-# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
-# observe the difference in targets printed.
+# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or
+# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed.
 target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
 target_link_libraries(hwy_benchmark hwy)
 set_target_properties(hwy_benchmark
@@ -444,6 +467,7 @@ set(HWY_TEST_FILES
   hwy/aligned_allocator_test.cc
   hwy/base_test.cc
   hwy/highway_test.cc
+  hwy/nanobenchmark_test.cc
   hwy/targets_test.cc
   hwy/examples/skeleton_test.cc
   hwy/tests/arithmetic_test.cc
@@ -451,14 +475,20 @@ set(HWY_TEST_FILES
   hwy/tests/blockwise_shift_test.cc
   hwy/tests/combine_test.cc
   hwy/tests/compare_test.cc
+  hwy/tests/compress_test.cc
   hwy/tests/convert_test.cc
   hwy/tests/crypto_test.cc
   hwy/tests/demote_test.cc
   hwy/tests/float_test.cc
+  hwy/tests/if_test.cc
+  hwy/tests/interleaved_test.cc
   hwy/tests/logical_test.cc
   hwy/tests/mask_test.cc
+  hwy/tests/mask_mem_test.cc
   hwy/tests/memory_test.cc
   hwy/tests/mul_test.cc
+  hwy/tests/reduction_test.cc
+  hwy/tests/reverse_test.cc
   hwy/tests/shift_test.cc
   hwy/tests/swizzle_test.cc
   hwy/tests/test_util_test.cc
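[Editor's note] The updated `hwy_benchmark` comment refers to observing which targets were compiled in. A minimal sketch of printing that information at runtime, assuming the `hwy::SupportedTargets()`/`hwy::TargetName()` API exported by libhwy (cf. the symbols files further below); only the return type width of `SupportedTargets()` changed between these versions, so the cast keeps the sketch version-agnostic:

```cpp
#include <cstdio>
#include "hwy/highway.h"  // HWY_STATIC_TARGET
#include "hwy/targets.h"  // SupportedTargets, TargetName

int main() {
  std::printf("baseline (static) target: %s\n",
              hwy::TargetName(HWY_STATIC_TARGET));
  std::printf("runtime-supported target bits: %lld\n",
              static_cast<long long>(hwy::SupportedTargets()));
  return 0;
}
```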
diff -pruN 0.17.0-11/debian/changelog 1.0.0-2/debian/changelog
--- 0.17.0-11/debian/changelog	2022-07-20 15:21:46.000000000 +0000
+++ 1.0.0-2/debian/changelog	2022-08-02 05:44:42.000000000 +0000
@@ -1,3 +1,20 @@
+highway (1.0.0-2) unstable; urgency=medium
+
+  * d/symbols: Fix symbols file on 32bits arch
+
+ -- Mathieu Malaterre <malat@debian.org>  Tue, 02 Aug 2022 07:44:42 +0200
+
+highway (1.0.0-1) unstable; urgency=medium
+
+  * d/watch: Fix typo in watch file
+  * d/copyright: Simplify copyright file
+  * New upstream version 1.0.0
+  * d/rules: Build SCALAR instead of EMU128
+  * d/patches: Remove patches applied upstream
+  * d/symbols: Update symbols file for new ABI
+
+ -- Mathieu Malaterre <malat@debian.org>  Mon, 01 Aug 2022 10:50:07 +0200
+
 highway (0.17.0-11) unstable; urgency=medium
 
   * d/patches: Fix blockwise_test on big-endian arches
diff -pruN 0.17.0-11/debian/control 1.0.0-2/debian/control
--- 0.17.0-11/debian/control	2022-07-18 13:06:02.000000000 +0000
+++ 1.0.0-2/debian/control	2022-08-01 08:41:09.000000000 +0000
@@ -11,7 +11,7 @@ Vcs-Git: https://salsa.debian.org/debian
 Vcs-Browser: https://salsa.debian.org/debian-phototools-team/highway
 Rules-Requires-Root: no
 
-Package: libhwy0
+Package: libhwy1
 Architecture: any
 Pre-Depends: ${misc:Pre-Depends}
 Depends: ${misc:Depends}, ${shlibs:Depends}
@@ -28,7 +28,7 @@ Package: libhwy-dev
 Architecture: any
 Section: libdevel
 Pre-Depends: ${misc:Pre-Depends}
-Depends: libhwy0 (= ${binary:Version}), ${misc:Depends}
+Depends: libhwy1 (= ${binary:Version}), ${misc:Depends}
 Multi-Arch: same
 Description: Efficient and performance-portable SIMD wrapper (developer files)
  This library provides type-safe and source-code portable wrappers over
diff -pruN 0.17.0-11/debian/copyright 1.0.0-2/debian/copyright
--- 0.17.0-11/debian/copyright	2022-07-11 16:13:41.000000000 +0000
+++ 1.0.0-2/debian/copyright	2022-08-01 07:29:20.000000000 +0000
@@ -1,7 +1,6 @@
 Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
 Upstream-Name: highway
 Source: https://github.com/google/highway
-Files-Excluded: debian
 
 Files: *
 Copyright: 2019-2022 Google LLC
diff -pruN 0.17.0-11/debian/libhwy0.install 1.0.0-2/debian/libhwy0.install
--- 0.17.0-11/debian/libhwy0.install	2022-07-11 16:13:41.000000000 +0000
+++ 1.0.0-2/debian/libhwy0.install	1970-01-01 00:00:00.000000000 +0000
@@ -1 +0,0 @@
-usr/lib/*/*.so.*
diff -pruN 0.17.0-11/debian/libhwy0.symbols 1.0.0-2/debian/libhwy0.symbols
--- 0.17.0-11/debian/libhwy0.symbols	2022-07-18 13:06:02.000000000 +0000
+++ 1.0.0-2/debian/libhwy0.symbols	1970-01-01 00:00:00.000000000 +0000
@@ -1,92 +0,0 @@
-libhwy.so.0 libhwy0 #MINVER#
- HWY_0@HWY_0 0.16.0
- _ZN3hwy12ChosenTarget6UpdateEv@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy14AlignedDeleter18DeleteAlignedArrayEPvPFvS1_S1_ES1_PFvS1_jE@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy14AlignedDeleter18DeleteAlignedArrayEPvPFvS1_S1_ES1_PFvS1_mE@HWY_0 0.16.0
- _ZN3hwy14DisableTargetsEj@HWY_0 0.16.0
- _ZN3hwy14Unpredictable1Ev@HWY_0 0.16.0
- _ZN3hwy15GetChosenTargetEv@HWY_0 0.16.0
- _ZN3hwy16FreeAlignedBytesEPKvPFvPvS2_ES2_@HWY_0 0.16.0
- _ZN3hwy16SupportedTargetsEv@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy20AllocateAlignedBytesEjPFPvS0_jES0_@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy20AllocateAlignedBytesEmPFPvS0_mES0_@HWY_0 0.16.0
- _ZN3hwy26SetSupportedTargetsForTestEj@HWY_0 0.16.0
- _ZN3hwy29SupportedTargetsCalledForTestEv@HWY_0 0.16.0
- _ZN3hwy5AbortEPKciS1_z@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy6detail10PrintArrayERKNS0_8TypeInfoEPKcPKvjjj@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy6detail10PrintArrayERKNS0_8TypeInfoEPKcPKvmmm@HWY_0 0.16.0
- _ZN3hwy6detail8ToStringERKNS0_8TypeInfoEPKvPc@HWY_0 0.17.0
- (arch-bits=32)_ZN3hwy6detail8TypeNameERKNS0_8TypeInfoEjPc@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy6detail8TypeNameERKNS0_8TypeInfoEmPc@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy7MeasureEPFyPKvjEPKhPKjjPNS_6ResultERKNS_6ParamsE@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy7MeasureEPFmPKvmEPKhPKmmPNS_6ResultERKNS_6ParamsE@HWY_0 0.16.0
- _ZN3hwy8platform15TimerResolutionEv@HWY_0 0.16.0
- _ZN3hwy8platform23InvariantTicksPerSecondEv@HWY_0 0.16.0
- _ZN3hwy8platform3NowEv@HWY_0 0.16.0
-libhwy_contrib.so.0 libhwy0 #MINVER#
- HWY_0@HWY_0 0.17.0
- (arch-bits=32)_ZN3hwy6Sorter11Fill24BytesEPKvjPv@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy6Sorter11Fill24BytesEPKvmPv@HWY_0 0.16.0
- _ZN3hwy6Sorter11HaveFloat64Ev@HWY_0 0.16.0
- _ZN3hwy6Sorter6DeleteEv@HWY_0 0.16.0
- _ZN3hwy6SorterC1Ev@HWY_0 0.16.0
- _ZN3hwy6SorterC2Ev@HWY_0 0.16.0
- _ZN3hwy9ImageBase10VectorSizeEv@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy9ImageBase11BytesPerRowEjj@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy9ImageBase17InitializePaddingEjNS0_7PaddingE@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy9ImageBase11BytesPerRowEmm@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy9ImageBase17InitializePaddingEmNS0_7PaddingE@HWY_0 0.16.0
- _ZN3hwy9ImageBase4SwapERS0_@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy9ImageBaseC1Ejjj@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy9ImageBaseC1EjjjPv@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy9ImageBaseC2Ejjj@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy9ImageBaseC2EjjjPv@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPNS_9uint128_tEjNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPNS_9uint128_tEjNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPdjNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPdjNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPfjNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPfjNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPijNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPijNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPjjNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPjjNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPsjNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPsjNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPtjNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPtjNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPxjNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPxjNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPyjNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=32)_ZNK3hwy6SorterclEPyjNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy9ImageBaseC1Emmm@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy9ImageBaseC1EmmmPv@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy9ImageBaseC2Emmm@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy9ImageBaseC2EmmmPv@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPNS_9uint128_tEmNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPNS_9uint128_tEmNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPdmNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPdmNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPfmNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPfmNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPimNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPimNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPjmNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPjmNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPlmNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPlmNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPmmNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPmmNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPsmNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPsmNS_14SortDescendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPtmNS_13SortAscendingE@HWY_0 0.16.0
- (arch-bits=64)_ZNK3hwy6SorterclEPtmNS_14SortDescendingE@HWY_0 0.16.0
-libhwy_test.so.0 libhwy0 #MINVER#
- HWY_0@HWY_0 0.17.0
- (arch-bits=32)_ZN3hwy10BytesEqualEPKvS1_jPj@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy6detail16AssertArrayEqualERKNS0_8TypeInfoEPKvS5_jPKcS7_i@HWY_0 0.16.0
- (arch-bits=32)_ZN3hwy6detail21PrintMismatchAndAbortERKNS0_8TypeInfoEPKvS5_PKcS7_ijj@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy10BytesEqualEPKvS1_mPm@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy6detail16AssertArrayEqualERKNS0_8TypeInfoEPKvS5_mPKcS7_i@HWY_0 0.16.0
- (arch-bits=64)_ZN3hwy6detail21PrintMismatchAndAbortERKNS0_8TypeInfoEPKvS5_PKcS7_imm@HWY_0 0.16.0
- _ZN3hwy6detail7IsEqualERKNS0_8TypeInfoEPKvS5_@HWY_0 0.16.0
diff -pruN 0.17.0-11/debian/libhwy1.install 1.0.0-2/debian/libhwy1.install
--- 0.17.0-11/debian/libhwy1.install	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/debian/libhwy1.install	2022-08-01 07:29:20.000000000 +0000
@@ -0,0 +1 @@
+usr/lib/*/*.so.*
diff -pruN 0.17.0-11/debian/libhwy1.symbols 1.0.0-2/debian/libhwy1.symbols
--- 0.17.0-11/debian/libhwy1.symbols	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/debian/libhwy1.symbols	2022-08-02 05:41:57.000000000 +0000
@@ -0,0 +1,96 @@
+libhwy.so.1 libhwy1 #MINVER#
+ HWY_0@HWY_0 1.0.0
+ (arch-bits=32)_ZN3hwy14AlignedDeleter18DeleteAlignedArrayEPvPFvS1_S1_ES1_PFvS1_jE@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy14AlignedDeleter18DeleteAlignedArrayEPvPFvS1_S1_ES1_PFvS1_mE@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy14DisableTargetsEx@HWY_0 1.0.0
+ (arch-bits=64)_ZN3hwy14DisableTargetsEl@HWY_0 1.0.0
+ _ZN3hwy14Unpredictable1Ev@HWY_0 0.16.0
+ _ZN3hwy15GetChosenTargetEv@HWY_0 0.16.0
+ _ZN3hwy16FreeAlignedBytesEPKvPFvPvS2_ES2_@HWY_0 0.16.0
+ _ZN3hwy16SupportedTargetsEv@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy20AllocateAlignedBytesEjPFPvS0_jES0_@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy20AllocateAlignedBytesEmPFPvS0_mES0_@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy26SetSupportedTargetsForTestEx@HWY_0 1.0.0
+ (arch-bits=64)_ZN3hwy26SetSupportedTargetsForTestEl@HWY_0 1.0.0
+ _ZN3hwy5AbortEPKciS1_z@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy6detail10PrintArrayERKNS0_8TypeInfoEPKcPKvjjj@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy6detail10PrintArrayERKNS0_8TypeInfoEPKcPKvmmm@HWY_0 0.16.0
+ _ZN3hwy6detail8ToStringERKNS0_8TypeInfoEPKvPc@HWY_0 0.17.0
+ (arch-bits=32)_ZN3hwy6detail8TypeNameERKNS0_8TypeInfoEjPc@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy6detail8TypeNameERKNS0_8TypeInfoEmPc@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy7MeasureEPFyPKvjEPKhPKjjPNS_6ResultERKNS_6ParamsE@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy7MeasureEPFmPKvmEPKhPKmmPNS_6ResultERKNS_6ParamsE@HWY_0 0.16.0
+ _ZN3hwy8platform15TimerResolutionEv@HWY_0 0.16.0
+ _ZN3hwy8platform23InvariantTicksPerSecondEv@HWY_0 0.16.0
+ _ZN3hwy8platform3NowEv@HWY_0 0.16.0
+libhwy_contrib.so.1 libhwy1 #MINVER#
+ HWY_0@HWY_0 1.0.0
+ (arch-bits=32)_ZN3hwy6Sorter11Fill24BytesEPKvjPv@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy6Sorter11Fill24BytesEPKvmPv@HWY_0 0.16.0
+ _ZN3hwy6Sorter11HaveFloat64Ev@HWY_0 0.16.0
+ _ZN3hwy6Sorter6DeleteEv@HWY_0 0.16.0
+ _ZN3hwy6SorterC1Ev@HWY_0 0.16.0
+ _ZN3hwy6SorterC2Ev@HWY_0 0.16.0
+ _ZN3hwy9ImageBase10VectorSizeEv@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy9ImageBase11BytesPerRowEjj@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy9ImageBase17InitializePaddingEjNS0_7PaddingE@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy9ImageBase11BytesPerRowEmm@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy9ImageBase17InitializePaddingEmNS0_7PaddingE@HWY_0 0.16.0
+ _ZN3hwy9ImageBase4SwapERS0_@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy9ImageBaseC1Ejjj@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy9ImageBaseC1EjjjPv@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy9ImageBaseC2Ejjj@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy9ImageBaseC2EjjjPv@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPNS_6K64V64EjNS_13SortAscendingE@HWY_0 1.0.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPNS_6K64V64EjNS_14SortDescendingE@HWY_0 1.0.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPNS_9uint128_tEjNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPNS_9uint128_tEjNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPdjNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPdjNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPfjNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPfjNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPijNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPijNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPjjNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPjjNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPsjNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPsjNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPtjNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPtjNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPxjNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPxjNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPyjNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=32)_ZNK3hwy6SorterclEPyjNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy9ImageBaseC1Emmm@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy9ImageBaseC1EmmmPv@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy9ImageBaseC2Emmm@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy9ImageBaseC2EmmmPv@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPNS_6K64V64EmNS_13SortAscendingE@HWY_0 1.0.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPNS_6K64V64EmNS_14SortDescendingE@HWY_0 1.0.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPNS_9uint128_tEmNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPNS_9uint128_tEmNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPdmNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPdmNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPfmNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPfmNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPimNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPimNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPjmNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPjmNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPlmNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPlmNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPmmNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPmmNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPsmNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPsmNS_14SortDescendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPtmNS_13SortAscendingE@HWY_0 0.16.0
+ (arch-bits=64)_ZNK3hwy6SorterclEPtmNS_14SortDescendingE@HWY_0 0.16.0
+libhwy_test.so.1 libhwy1 #MINVER#
+ HWY_0@HWY_0 1.0.0
+ (arch-bits=32)_ZN3hwy10BytesEqualEPKvS1_jPj@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy6detail16AssertArrayEqualERKNS0_8TypeInfoEPKvS5_jPKcS7_i@HWY_0 0.16.0
+ (arch-bits=32)_ZN3hwy6detail21PrintMismatchAndAbortERKNS0_8TypeInfoEPKvS5_PKcS7_ijj@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy10BytesEqualEPKvS1_mPm@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy6detail16AssertArrayEqualERKNS0_8TypeInfoEPKvS5_mPKcS7_i@HWY_0 0.16.0
+ (arch-bits=64)_ZN3hwy6detail21PrintMismatchAndAbortERKNS0_8TypeInfoEPKvS5_PKcS7_imm@HWY_0 0.16.0
+ _ZN3hwy6detail7IsEqualERKNS0_8TypeInfoEPKvS5_@HWY_0 0.16.0
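[Editor's note] The ABI bump is visible in the mangled names above: `DisableTargets` and `SetSupportedTargetsForTest` now take a 64-bit target mask (mangled `l`/`x`) instead of `uint32_t` (`j`), and the Sorter gains `K64V64` overloads. A minimal caller sketch, assuming the 1.0.0 signature `hwy::DisableTargets(int64_t)`:

```cpp
#include "hwy/detect_targets.h"  // HWY_AVX3, HWY_AVX3_DL target bits
#include "hwy/targets.h"

void AvoidAvx512() {
  // Exclude AVX-512 code paths; subsequent dynamic dispatch then picks the
  // best remaining supported target.
  hwy::DisableTargets(HWY_AVX3 | HWY_AVX3_DL);
}
```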
diff -pruN 0.17.0-11/debian/patches/4429b67a81bee50003e9c004e0c63581252e5274.patch 1.0.0-2/debian/patches/4429b67a81bee50003e9c004e0c63581252e5274.patch
--- 0.17.0-11/debian/patches/4429b67a81bee50003e9c004e0c63581252e5274.patch	2022-07-18 13:06:02.000000000 +0000
+++ 1.0.0-2/debian/patches/4429b67a81bee50003e9c004e0c63581252e5274.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,35 +0,0 @@
-From 4429b67a81bee50003e9c004e0c63581252e5274 Mon Sep 17 00:00:00 2001
-From: Jan Wassenberg <janwas@google.com>
-Date: Mon, 27 Jun 2022 06:43:13 -0700
-Subject: [PATCH] another big-endian fix (ReorderWidenMulAccumulate). Refs
- #775, thanks @malaterre
-
-PiperOrigin-RevId: 457465200
----
- hwy/ops/emu128-inl.h | 13 ++++++-------
- 1 file changed, 6 insertions(+), 7 deletions(-)
-
-Index: highway-0.17.0/hwy/ops/emu128-inl.h
-===================================================================
---- highway-0.17.0.orig/hwy/ops/emu128-inl.h
-+++ highway-0.17.0/hwy/ops/emu128-inl.h
-@@ -2183,13 +2183,12 @@ HWY_API Vec128<float, N> ReorderWidenMul
-                                                    Vec128<bfloat16_t, 2 * N> b,
-                                                    const Vec128<float, N> sum0,
-                                                    Vec128<float, N>& sum1) {
--  const Repartition<uint16_t, decltype(df32)> du16;
--  const RebindToUnsigned<decltype(df32)> du32;
--  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
--  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
--  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
--  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
--  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
-+  const Rebind<bfloat16_t, decltype(df32)> dbf16;
-+  // Avoid ZipLower/Upper so this also works on big-endian systems.
-+  const Vec128<float, N> a0 = PromoteTo(df32, LowerHalf(dbf16, a));
-+  const Vec128<float, N> a1 = PromoteTo(df32, UpperHalf(dbf16, a));
-+  const Vec128<float, N> b0 = PromoteTo(df32, LowerHalf(dbf16, b));
-+  const Vec128<float, N> b1 = PromoteTo(df32, UpperHalf(dbf16, b));
-   sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-   return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
- }
diff -pruN 0.17.0-11/debian/patches/46cac7ed326bb6c49fb3f3f572dd94161bc62ee8.patch 1.0.0-2/debian/patches/46cac7ed326bb6c49fb3f3f572dd94161bc62ee8.patch
--- 0.17.0-11/debian/patches/46cac7ed326bb6c49fb3f3f572dd94161bc62ee8.patch	2022-07-20 15:20:28.000000000 +0000
+++ 1.0.0-2/debian/patches/46cac7ed326bb6c49fb3f3f572dd94161bc62ee8.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,56 +0,0 @@
-From 46cac7ed326bb6c49fb3f3f572dd94161bc62ee8 Mon Sep 17 00:00:00 2001
-From: Jan Wassenberg <janwas@google.com>
-Date: Wed, 20 Jul 2022 06:20:25 -0700
-Subject: [PATCH] Fix blockwise_test for big-endian scalar. Fixes #858
-
-PiperOrigin-RevId: 462125090
----
- hwy/tests/blockwise_test.cc | 12 ++++++++++--
- 1 file changed, 10 insertions(+), 2 deletions(-)
-
-diff --git a/hwy/tests/blockwise_test.cc b/hwy/tests/blockwise_test.cc
-index 63a4fe43..84554d4f 100644
---- a/hwy/tests/blockwise_test.cc
-+++ b/hwy/tests/blockwise_test.cc
-@@ -248,17 +248,21 @@ struct TestZipLower {
-     const auto even = Load(d, even_lanes.get());
-     const auto odd = Load(d, odd_lanes.get());
- 
-+    const Repartition<WideT, D> dw;
-+#if HWY_TARGET == HWY_SCALAR
-+    // Safely handle big-endian
-+    const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
-+#else
-     const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
--
-     for (size_t i = 0; i < N; i += 2) {
-       const size_t base = (i / blockN) * blockN;
-       const size_t mod = i % blockN;
-       zip_lanes[i + 0] = even_lanes[mod / 2 + base];
-       zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
-     }
--    const Repartition<WideT, D> dw;
-     const auto expected =
-         Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
-+#endif  // HWY_TARGET == HWY_SCALAR
-     HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
-     HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
-   }
-@@ -267,6 +271,9 @@ struct TestZipLower {
- struct TestZipUpper {
-   template <class T, class D>
-   HWY_NOINLINE void operator()(T /*unused*/, D d) {
-+#if HWY_TARGET == HWY_SCALAR
-+    (void)d;
-+#else
-     using WideT = MakeWide<T>;
-     static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
-     static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
-@@ -295,6 +302,7 @@ struct TestZipUpper {
-     const auto expected =
-         Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
-     HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
-+#endif  // HWY_TARGET == HWY_SCALAR
-   }
- };
- 
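[Editor's note] The dropped patch above (now applied upstream) special-cases the `ZipLower` expectations for big-endian scalar builds. A minimal sketch of the op itself, assuming the usual `ScalableTag`/`RepartitionToWide` aliases from `hwy/highway.h`; the lane value shown holds on little-endian targets, which is exactly why the big-endian case needed its own expected value:

```cpp
#include <cstdint>
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// `out` must hold at least Lanes(dw) values.
void ZipOnesAndTwos(uint32_t* HWY_RESTRICT out) {
  const ScalableTag<uint16_t> d;
  const RepartitionToWide<decltype(d)> dw;  // u32 lanes
  // Interleaves the lower halves; on little-endian targets each u32 lane
  // equals 0x00020001 (lane from the first operand in the low half).
  StoreU(ZipLower(dw, Set(d, uint16_t{1}), Set(d, uint16_t{2})), dw, out);
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
```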
diff -pruN 0.17.0-11/debian/patches/94b156ebbd9eb40913366238d96138669ca3f80b.patch 1.0.0-2/debian/patches/94b156ebbd9eb40913366238d96138669ca3f80b.patch
--- 0.17.0-11/debian/patches/94b156ebbd9eb40913366238d96138669ca3f80b.patch	2022-07-18 13:06:02.000000000 +0000
+++ 1.0.0-2/debian/patches/94b156ebbd9eb40913366238d96138669ca3f80b.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,23 +0,0 @@
-From 94b156ebbd9eb40913366238d96138669ca3f80b Mon Sep 17 00:00:00 2001
-From: Moritz Firsching <firsching@google.com>
-Date: Wed, 22 Jun 2022 04:09:38 -0700
-Subject: [PATCH] make conversion explicit
-
-PiperOrigin-RevId: 456479531
----
- hwy/ops/scalar-inl.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h
-index 6f4ed915..660c0a65 100644
---- a/hwy/ops/scalar-inl.h
-+++ b/hwy/ops/scalar-inl.h
-@@ -722,7 +722,7 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
-   const TI rounded = static_cast<TI>(v.raw + bias);
-   if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
-   // Round to even
--  if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
-+  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
-     return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
-   }
-   return Vec1<T>(static_cast<T>(rounded));
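[Editor's note] The dropped patch above only made an integer-to-float conversion explicit in the scalar `Round`; the rule being implemented is round-half-to-even. A plain C++ sketch of that rule (not Highway code), assuming the default `FE_TONEAREST` rounding mode:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // nearbyint rounds ties to the even integer, which is the behaviour the
  // scalar Round aims to match: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2.
  std::printf("%g %g %g\n", std::nearbyint(0.5), std::nearbyint(1.5),
              std::nearbyint(2.5));
  return 0;
}
```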
diff -pruN 0.17.0-11/debian/patches/a1302c69192cfbea44cd5330ac828d40fba958e2.patch 1.0.0-2/debian/patches/a1302c69192cfbea44cd5330ac828d40fba958e2.patch
--- 0.17.0-11/debian/patches/a1302c69192cfbea44cd5330ac828d40fba958e2.patch	2022-07-18 13:06:02.000000000 +0000
+++ 1.0.0-2/debian/patches/a1302c69192cfbea44cd5330ac828d40fba958e2.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,209 +0,0 @@
-From a1302c69192cfbea44cd5330ac828d40fba958e2 Mon Sep 17 00:00:00 2001
-From: Jan Wassenberg <janwas@google.com>
-Date: Wed, 8 Jun 2022 02:53:01 -0700
-Subject: [PATCH] GCC workaround: avoid reinterpret_cast. Refs #398 This caused
- incorrect codegen in mask_test for partial vectors (i16x2).
-
-PiperOrigin-RevId: 453634364
----
- hwy/base.h             |   2 +
- hwy/ops/arm_neon-inl.h | 104 +++++++++++++++--------------------------
- hwy/tests/mask_test.cc |   8 +---
- 3 files changed, 41 insertions(+), 73 deletions(-)
-
---- highway-0.17.0.orig/hwy/base.h
-+++ highway-0.17.0/hwy/base.h
-@@ -309,6 +309,8 @@ HWY_API constexpr bool IsSame() {
-   hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
- #define HWY_IF_NOT_LANE_SIZE(T, bytes) \
-   hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
-+#define HWY_IF_LANE_SIZE_LT(T, bytes) \
-+  hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
- 
- #define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
-   hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
---- highway-0.17.0.orig/hwy/ops/arm_neon-inl.h
-+++ highway-0.17.0/hwy/ops/arm_neon-inl.h
-@@ -2625,61 +2625,49 @@ HWY_API Vec64<double> LoadU(Full64<doubl
-   return Vec64<double>(vld1_f64(p));
- }
- #endif
--
- // ------------------------------ Load 32
- 
--HWY_API Vec32<uint8_t> LoadU(Full32<uint8_t> /*tag*/,
--                             const uint8_t* HWY_RESTRICT p) {
--  uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
--  return Vec32<uint8_t>(vreinterpret_u8_u32(a));
--}
--HWY_API Vec32<uint16_t> LoadU(Full32<uint16_t> /*tag*/,
--                              const uint16_t* HWY_RESTRICT p) {
--  uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
--  return Vec32<uint16_t>(vreinterpret_u16_u32(a));
--}
-+// Actual 32-bit broadcast load - used to implement the other lane types
-+// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
- HWY_API Vec32<uint32_t> LoadU(Full32<uint32_t> /*tag*/,
-                               const uint32_t* HWY_RESTRICT p) {
--  return Vec32<uint32_t>(vld1_dup_u32(reinterpret_cast<const uint32_t*>(p)));
--}
--HWY_API Vec32<int8_t> LoadU(Full32<int8_t> /*tag*/,
--                            const int8_t* HWY_RESTRICT p) {
--  int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
--  return Vec32<int8_t>(vreinterpret_s8_s32(a));
--}
--HWY_API Vec32<int16_t> LoadU(Full32<int16_t> /*tag*/,
--                             const int16_t* HWY_RESTRICT p) {
--  int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
--  return Vec32<int16_t>(vreinterpret_s16_s32(a));
-+  return Vec32<uint32_t>(vld1_dup_u32(p));
- }
- HWY_API Vec32<int32_t> LoadU(Full32<int32_t> /*tag*/,
-                              const int32_t* HWY_RESTRICT p) {
--  return Vec32<int32_t>(vld1_dup_s32(reinterpret_cast<const int32_t*>(p)));
-+  return Vec32<int32_t>(vld1_dup_s32(p));
- }
- HWY_API Vec32<float> LoadU(Full32<float> /*tag*/, const float* HWY_RESTRICT p) {
-   return Vec32<float>(vld1_dup_f32(p));
- }
- 
-+template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
-+HWY_API Vec32<T> LoadU(Full32<T> d, const T* HWY_RESTRICT p) {
-+  const Repartition<uint32_t, decltype(d)> d32;
-+  uint32_t buf;
-+  CopyBytes<4>(p, &buf);
-+  return BitCast(d, LoadU(d32, &buf));
-+}
-+
- // ------------------------------ Load 16
- 
--HWY_API Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2, 0> /*tag*/,
--                                 const uint8_t* HWY_RESTRICT p) {
--  uint16x4_t a = vld1_dup_u16(reinterpret_cast<const uint16_t*>(p));
--  return Vec128<uint8_t, 2>(vreinterpret_u8_u16(a));
--}
-+// Actual 16-bit broadcast load - used to implement the other lane types
-+// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
- HWY_API Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1, 0> /*tag*/,
-                                   const uint16_t* HWY_RESTRICT p) {
--  return Vec128<uint16_t, 1>(
--      vld1_dup_u16(reinterpret_cast<const uint16_t*>(p)));
--}
--HWY_API Vec128<int8_t, 2> LoadU(Simd<int8_t, 2, 0> /*tag*/,
--                                const int8_t* HWY_RESTRICT p) {
--  int16x4_t a = vld1_dup_s16(reinterpret_cast<const int16_t*>(p));
--  return Vec128<int8_t, 2>(vreinterpret_s8_s16(a));
-+  return Vec128<uint16_t, 1>(vld1_dup_u16(p));
- }
- HWY_API Vec128<int16_t, 1> LoadU(Simd<int16_t, 1, 0> /*tag*/,
-                                  const int16_t* HWY_RESTRICT p) {
--  return Vec128<int16_t, 1>(vld1_dup_s16(reinterpret_cast<const int16_t*>(p)));
-+  return Vec128<int16_t, 1>(vld1_dup_s16(p));
-+}
-+
-+template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
-+HWY_API Vec128<T, 2> LoadU(Simd<T, 2, 0> d, const T* HWY_RESTRICT p) {
-+  const Repartition<uint16_t, decltype(d)> d16;
-+  uint16_t buf;
-+  CopyBytes<2>(p, &buf);
-+  return BitCast(d, LoadU(d16, &buf));
- }
- 
- // ------------------------------ Load 8
-@@ -2821,30 +2809,10 @@ HWY_API void StoreU(const Vec64<double>
- 
- // ------------------------------ Store 32
- 
--HWY_API void StoreU(const Vec32<uint8_t> v, Full32<uint8_t>,
--                    uint8_t* HWY_RESTRICT p) {
--  uint32x2_t a = vreinterpret_u32_u8(v.raw);
--  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
--}
--HWY_API void StoreU(const Vec32<uint16_t> v, Full32<uint16_t>,
--                    uint16_t* HWY_RESTRICT p) {
--  uint32x2_t a = vreinterpret_u32_u16(v.raw);
--  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
--}
- HWY_API void StoreU(const Vec32<uint32_t> v, Full32<uint32_t>,
-                     uint32_t* HWY_RESTRICT p) {
-   vst1_lane_u32(p, v.raw, 0);
- }
--HWY_API void StoreU(const Vec32<int8_t> v, Full32<int8_t>,
--                    int8_t* HWY_RESTRICT p) {
--  int32x2_t a = vreinterpret_s32_s8(v.raw);
--  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
--}
--HWY_API void StoreU(const Vec32<int16_t> v, Full32<int16_t>,
--                    int16_t* HWY_RESTRICT p) {
--  int32x2_t a = vreinterpret_s32_s16(v.raw);
--  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
--}
- HWY_API void StoreU(const Vec32<int32_t> v, Full32<int32_t>,
-                     int32_t* HWY_RESTRICT p) {
-   vst1_lane_s32(p, v.raw, 0);
-@@ -2854,27 +2822,31 @@ HWY_API void StoreU(const Vec32<float> v
-   vst1_lane_f32(p, v.raw, 0);
- }
- 
-+template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
-+HWY_API void StoreU(const Vec32<T> v, Full32<T> d, T* HWY_RESTRICT p) {
-+  const Repartition<uint32_t, decltype(d)> d32;
-+  const uint32_t buf = GetLane(BitCast(d32, v));
-+  CopyBytes<4>(&buf, p);
-+}
-+
- // ------------------------------ Store 16
- 
--HWY_API void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2, 0>,
--                    uint8_t* HWY_RESTRICT p) {
--  uint16x4_t a = vreinterpret_u16_u8(v.raw);
--  vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
--}
- HWY_API void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1, 0>,
-                     uint16_t* HWY_RESTRICT p) {
-   vst1_lane_u16(p, v.raw, 0);
- }
--HWY_API void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2, 0>,
--                    int8_t* HWY_RESTRICT p) {
--  int16x4_t a = vreinterpret_s16_s8(v.raw);
--  vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
--}
- HWY_API void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1, 0>,
-                     int16_t* HWY_RESTRICT p) {
-   vst1_lane_s16(p, v.raw, 0);
- }
- 
-+template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
-+HWY_API void StoreU(const Vec128<T, 2> v, Simd<T, 2, 0> d, T* HWY_RESTRICT p) {
-+  const Repartition<uint16_t, decltype(d)> d16;
-+  const uint16_t buf = GetLane(BitCast(d16, v));
-+  CopyBytes<2>(&buf, p);
-+}
-+
- // ------------------------------ Store 8
- 
- HWY_API void StoreU(const Vec128<uint8_t, 1> v, Simd<uint8_t, 1, 0>,
---- highway-0.17.0.orig/hwy/tests/mask_test.cc
-+++ highway-0.17.0/hwy/tests/mask_test.cc
-@@ -237,8 +237,6 @@ struct TestAllTrueFalse {
-     auto lanes = AllocateAligned<T>(N);
-     std::fill(lanes.get(), lanes.get() + N, T(0));
- 
--    auto mask_lanes = AllocateAligned<T>(N);
--
-     HWY_ASSERT(AllTrue(d, Eq(v, zero)));
-     HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
- 
-@@ -251,11 +249,7 @@ struct TestAllTrueFalse {
-       lanes[i] = T(1);
-       v = Load(d, lanes.get());
- 
--      // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
--      // Assigning to an lvalue is insufficient but storing to memory prevents
--      // the bug; so does Print of VecFromMask(d, Eq(v, zero)).
--      Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
--      HWY_ASSERT(!AllTrue(d, MaskFromVec(Load(d, mask_lanes.get()))));
-+      HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
- 
-       HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
- 
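[Editor's note] The dropped patch above (applied upstream) replaced pointer `reinterpret_cast`s in the NEON partial-vector `LoadU`/`StoreU` overloads with byte copies via `CopyBytes`. A plain C++ sketch of the underlying technique, using only standard facilities:

```cpp
#include <cstdint>
#include <cstring>

// Instead of reinterpret_cast'ing a narrow-lane pointer to a wider type
// (which led to incorrect codegen on GCC), copy the bytes into a correctly
// typed buffer and operate on that.
static inline uint32_t LoadFourBytes(const uint8_t* p) {
  uint32_t buf;
  std::memcpy(&buf, p, sizeof(buf));  // well-defined for any alignment; no aliasing UB
  return buf;
}
```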
diff -pruN 0.17.0-11/debian/patches/a80e816ef9db9de04239f6ad3db4f4db991a544f.patch 1.0.0-2/debian/patches/a80e816ef9db9de04239f6ad3db4f4db991a544f.patch
--- 0.17.0-11/debian/patches/a80e816ef9db9de04239f6ad3db4f4db991a544f.patch	2022-07-18 13:06:02.000000000 +0000
+++ 1.0.0-2/debian/patches/a80e816ef9db9de04239f6ad3db4f4db991a544f.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,52 +0,0 @@
-From a80e816ef9db9de04239f6ad3db4f4db991a544f Mon Sep 17 00:00:00 2001
-From: Jan Wassenberg <janwas@google.com>
-Date: Mon, 27 Jun 2022 04:38:04 -0700
-Subject: [PATCH] hopefully fix big-endian issue with BF16. Refs #775
-
-PiperOrigin-RevId: 457445742
----
- hwy/ops/emu128-inl.h     | 7 ++++---
- hwy/tests/demote_test.cc | 9 +++++----
- 2 files changed, 9 insertions(+), 7 deletions(-)
-
-Index: highway-0.17.0/hwy/ops/emu128-inl.h
-===================================================================
---- highway-0.17.0.orig/hwy/ops/emu128-inl.h
-+++ highway-0.17.0/hwy/ops/emu128-inl.h
-@@ -1433,10 +1433,11 @@ HWY_API Vec128<ToT, N> DemoteTo(Simd<ToT
- template <size_t N>
- HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
-     Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
--  const RebindToUnsigned<decltype(dbf16)> du16;
-   const Repartition<uint32_t, decltype(dbf16)> du32;
--  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
--  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-+  const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(BitCast(du32, b));
-+  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
-+  const Vec128<uint32_t, N> a_mask = Set(du32, 0xFFFF0000);
-+  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
- }
- 
- namespace detail {
-Index: highway-0.17.0/hwy/tests/demote_test.cc
-===================================================================
---- highway-0.17.0.orig/hwy/tests/demote_test.cc
-+++ highway-0.17.0/hwy/tests/demote_test.cc
-@@ -235,12 +235,13 @@ class TestReorderDemote2To {
-       const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
- 
-       // Smoke test: sum should be same (with tolerance for non-associativity)
--      const auto sum_expected =
-+      const auto sum_expected = GetLane(SumOfLanes(d32, Add(f0, f1)));
-+      const auto sum_actual =
-           GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
--      const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
--      HWY_ASSERT(sum_actual - 1E-4 <= sum_actual &&
--                 sum_expected <= sum_actual + 1E-4);
- 
-+      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
-+                 sum_actual <= sum_expected + 1E-4);
-+ 
-       // Ensure values are the same after sorting to undo the Reorder
-       Store(f0, d32, expected.get() + 0);
-       Store(f1, d32, expected.get() + N);
diff -pruN 0.17.0-11/debian/patches/base_symbols.patch 1.0.0-2/debian/patches/base_symbols.patch
--- 0.17.0-11/debian/patches/base_symbols.patch	2022-07-18 13:06:02.000000000 +0000
+++ 1.0.0-2/debian/patches/base_symbols.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,55 +0,0 @@
-Description: <short summary of the patch>
- TODO: Put a short summary on the line above and replace this paragraph
- with a longer explanation of this change. Complete the meta-information
- with other relevant fields (see below for details). To make it easier, the
- information below has been extracted from the changelog. Adjust it or drop
- it.
- .
- highway (0.17.0-1) UNRELEASED; urgency=medium
- .
-   * New upstream version 0.17.0
-Author: Mathieu Malaterre <malat@debian.org>
-
----
-The information above should follow the Patch Tagging Guidelines, please
-checkout http://dep.debian.net/deps/dep3/ to learn about the format. Here
-are templates for supplementary fields that you might want to add:
-
-Origin: <vendor|upstream|other>, <url of original patch>
-Bug: <url in upstream bugtracker>
-Bug-Debian: https://bugs.debian.org/<bugnumber>
-Bug-Ubuntu: https://launchpad.net/bugs/<bugnumber>
-Forwarded: <no|not-needed|url proving that it has been forwarded>
-Reviewed-By: <name and email of someone who approved the patch>
-Last-Update: 2022-06-21
-
---- highway-0.17.0.orig/CMakeLists.txt
-+++ highway-0.17.0/CMakeLists.txt
-@@ -281,6 +281,13 @@ target_include_directories(hwy_contrib P
-     $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
-     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
- target_compile_features(hwy_contrib PUBLIC cxx_std_11)
-+set_target_properties(hwy_contrib PROPERTIES
-+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
-+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
-+if(UNIX AND NOT APPLE)
-+  set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
-+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
-+endif()
- endif()  # HWY_ENABLE_CONTRIB
- 
- add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
-@@ -292,6 +299,13 @@ target_include_directories(hwy_test PUBL
-     $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
-     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
- target_compile_features(hwy_test PUBLIC cxx_std_11)
-+set_target_properties(hwy_test PROPERTIES
-+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
-+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
-+if(UNIX AND NOT APPLE)
-+  set_property(TARGET hwy_test APPEND_STRING PROPERTY
-+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
-+endif()
- 
- # -------------------------------------------------------- hwy_list_targets
- # Generate a tool to print the compiled-in targets as defined by the current
diff -pruN 0.17.0-11/debian/patches/neon.patch 1.0.0-2/debian/patches/neon.patch
--- 0.17.0-11/debian/patches/neon.patch	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/debian/patches/neon.patch	2022-08-01 07:30:19.000000000 +0000
@@ -0,0 +1,16 @@
+Description: Run a full test suite on neon
+Author: Mathieu Malaterre <malat@debian.org>
+Forwarded: no
+Last-Update: 2022-07-07
+
+--- highway-0.17.1~git20220707.b0108ff.orig/hwy/tests/convert_test.cc
++++ highway-0.17.1~git20220707.b0108ff/hwy/tests/convert_test.cc
+@@ -391,7 +391,7 @@ struct TestIntFromFloatHuge {
+     // Still does not work, although ARMv7 manual says that float->int
+     // saturates, i.e. chooses the nearest representable value. Also causes
+     // out-of-memory for MSVC.
+-#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC
++#if !HWY_COMPILER_MSVC
+     using TI = MakeSigned<TF>;
+     const Rebind<TI, DF> di;
+ 
diff -pruN 0.17.0-11/debian/patches/revert_emu128.patch 1.0.0-2/debian/patches/revert_emu128.patch
--- 0.17.0-11/debian/patches/revert_emu128.patch	2022-07-18 13:39:52.000000000 +0000
+++ 1.0.0-2/debian/patches/revert_emu128.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,28 +0,0 @@
-Description: EMU128 and gcc-12 do not work well
-Author: Mathieu Malaterre <malat@debian.org>
-Bug-Debian: https://bugs.debian.org/1015256
-Forwarded: not-needed
-Last-Update: 2022-07-18
-
---- highway-0.17.0.orig/hwy/detect_targets.h
-+++ highway-0.17.0/hwy/detect_targets.h
-@@ -168,7 +168,7 @@
- #if defined(HWY_COMPILE_ONLY_SCALAR)
- #define HWY_BASELINE_SCALAR HWY_SCALAR
- #else
--#define HWY_BASELINE_SCALAR HWY_EMU128
-+#define HWY_BASELINE_SCALAR HWY_SCALAR
- #endif
- 
- // Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
---- highway-0.17.0.orig/hwy/targets.cc
-+++ highway-0.17.0/hwy/targets.cc
-@@ -253,7 +253,7 @@ HWY_DLLEXPORT uint32_t SupportedTargets(
- #if defined(HWY_COMPILE_ONLY_SCALAR)
-   bits = HWY_SCALAR;
- #else
--  bits = HWY_EMU128;
-+  bits = HWY_SCALAR;
- #endif
- 
- #if HWY_ARCH_X86
diff -pruN 0.17.0-11/debian/patches/series 1.0.0-2/debian/patches/series
--- 0.17.0-11/debian/patches/series	2022-07-20 15:20:34.000000000 +0000
+++ 1.0.0-2/debian/patches/series	2022-08-01 07:30:31.000000000 +0000
@@ -1,7 +1 @@
-base_symbols.patch
-a1302c69192cfbea44cd5330ac828d40fba958e2.patch
-a80e816ef9db9de04239f6ad3db4f4db991a544f.patch
-4429b67a81bee50003e9c004e0c63581252e5274.patch
-94b156ebbd9eb40913366238d96138669ca3f80b.patch
-revert_emu128.patch
-46cac7ed326bb6c49fb3f3f572dd94161bc62ee8.patch
+neon.patch
diff -pruN 0.17.0-11/debian/rules 1.0.0-2/debian/rules
--- 0.17.0-11/debian/rules	2022-07-18 13:41:01.000000000 +0000
+++ 1.0.0-2/debian/rules	2022-08-01 07:29:20.000000000 +0000
@@ -9,6 +9,9 @@ export DPKG_GENSYMBOLS_CHECK_LEVEL = 4
 %:
 	dh $@ --buildsystem=cmake+ninja
 
+# Prefer SCALAR over EMU128 for now, see #1015256
+DEB_CXXFLAGS_MAINT_APPEND += -DHWY_BROKEN_EMU128=1
+
 ifneq (,$(filter $(DEB_BUILD_ARCH), armhf armel))
   # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
   DEB_CXXFLAGS_MAINT_APPEND += -Wno-psabi
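[Editor's note] `-DHWY_BROKEN_EMU128=1` makes `HWY_SCALAR` the fallback target (see the `impl_details.md` hunk further below). Code built that way may need guards around ops that `HWY_SCALAR` does not provide. A minimal sketch, assuming `Lt`, `Lt128` and the `Mask<D>`/`Vec<D>` aliases as documented in the quick reference:

```cpp
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Compares a[1]:a[0] < b[1]:b[0] for u64 pairs; on the single-lane SCALAR
// target Lt128 is unavailable, so fall back to a plain 64-bit comparison.
template <class D, class V = Vec<D>>  // D is a u64 descriptor
Mask<D> Less128(D d, V a, V b) {
#if HWY_TARGET == HWY_SCALAR
  (void)d;
  return Lt(a, b);  // only one 64-bit lane exists
#else
  return Lt128(d, a, b);
#endif
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
```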
diff -pruN 0.17.0-11/debian/watch 1.0.0-2/debian/watch
--- 0.17.0-11/debian/watch	2022-07-18 13:06:02.000000000 +0000
+++ 1.0.0-2/debian/watch	2022-08-01 07:29:20.000000000 +0000
@@ -1,3 +1,3 @@
 version=4
-opts=filenamemangle=s/.+\/v?(\d\S+)\.tar\.gz/libjxl-$1\.tar\.gz/ \
+opts=filenamemangle=s/.+\/v?(\d\S+)\.tar\.gz/highway-$1\.tar\.gz/ \
   https://github.com/google/highway/tags .*/v?(\d\S+)\.tar\.gz
diff -pruN 0.17.0-11/g3doc/impl_details.md 1.0.0-2/g3doc/impl_details.md
--- 0.17.0-11/g3doc/impl_details.md	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/g3doc/impl_details.md	2022-07-27 11:48:16.000000000 +0000
@@ -73,11 +73,10 @@ for the next target. This is accomplishe
 translation unit, which may in turn `#include` one or more `-inl.h` files. As an
 exception, `hwy/ops/*-inl.h` do not require include guards because they are all
 included from highway.h, which takes care of this in a single location. Note
-that platforms such as WASM and RISC-V which currently only offer a single
-target do not require multiple compilation, but the same mechanism is used
-without actually re-#including. For both of those platforms, it is possible that
-additional targets will later be added, which means this mechanism will then be
-required.
+that platforms such as RISC-V which currently only offer a single target do not
+require multiple compilation, but the same mechanism is used without actually
+re-#including. For both of those platforms, it is possible that additional
+re-#including. For such platforms, it is possible that additional targets
+will later be added, which means this mechanism will then be required.
+targets will later be added, which means this mechanism will then be required.
 
 Instead of a -inl.h file, you can also use a normal .cc/.h component, where the
 vector code is hidden inside the .cc file, and the header only declares a normal
@@ -136,6 +135,54 @@ which are shared between all targets. Th
 [x86_128-inl.h](../hwy/ops/x86_128-inl.h) and are also templated on the
 vector type.
 
+## Adding a new op
+
+Adding an op consists of three steps, listed below. As an example, consider
+https://github.com/google/highway/commit/6c285d64ae50e0f48866072ed3a476fc12df5ab6.
+
+1) Document the new op in `g3doc/quick_reference.md` with its function signature
+and a description of what the op does.
+
+2) Implement the op in each `ops/*-inl.h` header. There are two exceptions,
+detailed in the previous section: first, `generic_ops-inl.h` is not changed in
+the common case where the op has a unique definition for every target. Second,
+if the op's definition would be duplicated in `x86_256-inl.h` and
+`x86_512-inl.h`, it may be expressed as a template in `x86_128-inl.h` with a
+`class V` template argument, e.g. `TableLookupBytesOr0`.
+
+3) Pick the appropriate `hwy/tests/*_test.cc` and add a test. This is also a
+three step process: first define a functor that implements the test logic (e.g.
+`TestPlusMinus`), then a function (e.g. `TestAllPlusMinus`) that invokes this
+functor for all lane types the op supports, and finally a line near the end of
+the file that invokes the function for all targets:
+`HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);`. Note the naming
+convention that the function has the same name as the functor except for the
+`TestAll` prefix.
+
+## Documentation of platform-specific intrinsics
+
+When adding a new op, it is often necessary to consult the reference for each
+platform's intrinsics.
+
+For x86 targets `HWY_SSSE3`, `HWY_SSE4`, `HWY_AVX2`, `HWY_AVX3`, `HWY_AVX3_DL`
+Intel provides a
+[searchable reference](https://www.intel.com/content/www/us/en/docs/intrinsics-guide).
+
+For Arm targets `HWY_NEON`, `HWY_SVE` (plus its specialization for 256-bit
+vectors `HWY_SVE_256`), `HWY_SVE2` (plus its specialization for 128-bit vectors
+`HWY_SVE2_128`), Arm provides a
+[searchable reference](https://developer.arm.com/architectures/instruction-sets/intrinsics).
+
+For RISC-V target `HWY_RVV`, we refer to the assembly language
+[specification](https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc)
+plus the separate
+[intrinsics specification](https://github.com/riscv-non-isa/rvv-intrinsic-doc).
+
+For WebAssembly target `HWY_WASM`, we recommend consulting the
+[intrinsics header](https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/wasm_simd128.h).
+There is also an unofficial
+[searchable list of intrinsics](https://nemequ.github.io/waspr/intrinsics).
+
 ## Why scalar target
 
 There can be various reasons to avoid using vector intrinsics:
@@ -157,6 +204,9 @@ guarantees 16-byte vectors are available
 supports all ops. Both of these alternatives are slower than native vector code,
 but they allow testing your code even when actual vectors are unavailable.
 
-`HWY_SCALAR` is only enabled/used `#ifdef HWY_COMPILE_ONLY_SCALAR`. Projects
-that intend to use it may require `#if HWY_TARGET != HWY_SCALAR` around the ops
-it does not support to prevent compile errors.
+One of the above targets is used if the CPU does not support any actual SIMD
+target. To avoid compiling any intrinsics, define `HWY_COMPILE_ONLY_EMU128`.
+
+`HWY_SCALAR` is only enabled/used `#ifdef HWY_COMPILE_ONLY_SCALAR` (or `#if
+HWY_BROKEN_EMU128`). Projects that intend to use it may require `#if HWY_TARGET
+!= HWY_SCALAR` around the ops it does not support to prevent compile errors.
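[Editor's note] A minimal sketch of step 3 of the "Adding a new op" procedure described in the hunk above (functor, `TestAll*` wrapper, `HWY_EXPORT_AND_TEST_P`); the names `foo_test.cc`, `TestNeg`, `TestAllNeg` and `HwyFooTest` are illustrative, not taken from the diff:

```cpp
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/tests/foo_test.cc"  // hypothetical test file
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

struct TestNeg {  // functor implementing the test logic
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, T(1));                   // 1, 2, 3, ...
    HWY_ASSERT_VEC_EQ(d, Sub(Zero(d), v), Neg(v));  // Neg(v) == 0 - v
  }
};

// Same name as the functor plus the TestAll prefix, per the convention above.
HWY_NOINLINE void TestAllNeg() {
  ForSignedTypes(ForPartialVectors<TestNeg>());
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyFooTest);
HWY_EXPORT_AND_TEST_P(HwyFooTest, TestAllNeg);  // runs once per compiled target
}  // namespace hwy
#endif
```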
diff -pruN 0.17.0-11/g3doc/quick_reference.md 1.0.0-2/g3doc/quick_reference.md
--- 0.17.0-11/g3doc/quick_reference.md	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/g3doc/quick_reference.md	2022-07-27 11:48:16.000000000 +0000
@@ -10,7 +10,7 @@ using platform-specific intrinsics, whic
 `hwy/contrib` also includes higher-level algorithms such as `FindIf` or `Sorter`
 implemented using these ops.
 
-Highyway can use dynamic dispatch, which chooses the best available
+Highway can use dynamic dispatch, which chooses the best available
 implementation at runtime, or static dispatch which has no runtime overhead.
 Dynamic dispatch works by compiling your code once per target CPU and then
 selecting (via indirect call) at runtime.
@@ -103,9 +103,22 @@ of type `D` and return an actual vector
 The actual lane count (used to increment loop counters etc.) can be obtained via
 `Lanes(d)`. This value might not be known at compile time, thus storage for
 vectors should be dynamically allocated, e.g. via `AllocateAligned(Lanes(d))`.
-Note that `Lanes(d)` could potentially change at runtime, upon user request via
-special CPU instructions. Thus we discourage caching the result; it is typically
-used inside a function or basic block.
+
+Note that `Lanes(d)` could potentially change at runtime. This is currently
+unlikely, and will not be initiated by Highway without user action, but could
+still happen in other circumstances:
+
+*   upon user request in future via special CPU instructions (switching to
+    'streaming SVE' mode for Arm SME), or
+*   via system software (`prctl(PR_SVE_SET_VL` on Linux for Arm SVE). When the
+    vector length is changed using this mechanism, all but the lower 128 bits of
+    vector registers are invalidated.
+
+Thus we discourage caching the result; it is typically used inside a function or
+basic block. If the application anticipates that one of the above circumstances
+could happen, it should ensure by some out-of-band mechanism that such changes
+will not happen during the critical section (the vector code which uses the
+result of the previously obtained `Lanes(d)`).
 
 `MaxLanes(d)` returns a (potentially loose) upper bound on `Lanes(d)`, and is
 implemented as a constexpr function.
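[Editor's note] A minimal sketch of the pattern the amended text recommends: query `Lanes(d)` inside the function that uses it and size the loop from it, rather than caching the value across calls:

```cpp
#include <cstddef>
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void AddOne(const float* HWY_RESTRICT in, float* HWY_RESTRICT out, size_t count) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);  // obtained here, not stored globally
  size_t i = 0;
  for (; i + N <= count; i += N) {
    StoreU(Add(LoadU(d, in + i), Set(d, 1.0f)), d, out + i);
  }
  for (; i < count; ++i) out[i] = in[i] + 1.0f;  // scalar remainder
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
```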
@@ -262,9 +275,10 @@ code, in descending order of preference:
 -   `using hwy::HWY_NAMESPACE;` directive. This is generally discouraged,
     especially for SIMD code residing in a header.
 
-Note that overloaded operators are not yet supported on RVV and SVE; code that
-wishes to run on all targets until that is resolved can use the corresponding
-equivalents functions such as `Eq`, `Lt`, `Add`, `Div` etc.
+Note that overloaded operators are not yet supported on RVV and SVE. Until that
+is resolved, code that wishes to run on all targets must use the corresponding
+equivalents mentioned in the description of each overloaded operator, for
+example `Lt` instead of `operator<`.
 
 ### Initialization
 
@@ -379,6 +393,14 @@ All other ops in this section are only a
     128-bit values, each stored as an adjacent pair of 64-bit lanes (e.g.
     indices 1 and 0, where 0 is the least-significant 64-bits).
 
+*   `V`: `u64` \
+    <code>M **Min128Upper**(D, V a, V b)</code>: for each 128-bit key-value
+    pair, returns `a` if it is considered less than `b` by Lt128Upper, else `b`.
+
+*   `V`: `u64` \
+    <code>M **Max128Upper**(D, V a, V b)</code>: for each 128-bit key-value
+    pair, returns `a` if it is considered > `b` by Lt128Upper, else `b`.
+
 #### Multiply
 
 *   `V`: `{u,i}{16,32}` \
@@ -663,6 +685,16 @@ false is zero, true has all bits set:
     is already a mask, e.g. returned by a comparison.
 
 *   `V`: `{u,i,f}{16,32,64}` \
+    <code>V **CompressNot**(V v, M m)</code>: equivalent to `Compress(v,
+    Not(m))` but possibly faster if `CompressIsPartition<T>::value` is true.
+
+*   `V`: `u64` \
+    <code>V **CompressBlocksNot**(V v, M m)</code>: equivalent to
+    `CompressNot(v, m)` when `m` is structured as adjacent pairs (both true or
+    false), e.g. as returned by `Lt128`. This is a no-op for 128 bit vectors.
+    Unavailable if `HWY_TARGET == HWY_SCALAR`.
+
+*   `V`: `{u,i,f}{16,32,64}` \
     <code>size_t **CompressStore**(V v, M m, D d, T* p)</code>: writes lanes
     whose mask `m` is true into `p`, starting from lane 0. Returns `CountTrue(d,
     m)`, the number of valid lanes. May be implemented as `Compress` followed by
@@ -723,17 +755,32 @@ These return a mask (see above) indicati
     lanes (e.g. indices 1,0), returns whether a[1]:a[0] concatenated to an
     unsigned 128-bit integer (least significant bits in a[0]) is less than
     b[1]:b[0]. For each pair, the mask lanes are either both true or both false.
-    Only available if `HWY_TARGET != HWY_SCALAR`.
+    Unavailable if `HWY_TARGET == HWY_SCALAR`.
+
+*   `V`: `u64` \
+    <code>M **Lt128Upper**(D, V a, V b)</code>: for each adjacent pair of 64-bit
+    lanes (e.g. indices 1,0), returns whether a[1] is less than b[1]. For each
+    pair, the mask lanes are either both true or both false. This is useful for
+    comparing 64-bit keys alongside 64-bit values. Only available if `HWY_TARGET
+    != HWY_SCALAR`.
 
 ### Memory
 
 Memory operands are little-endian, otherwise their order would depend on the
 lane configuration. Pointers are the addresses of `N` consecutive `T` values,
-either naturally-aligned (`aligned`) or possibly unaligned (`p`).
+either `aligned` (address is a multiple of the vector size) or possibly
+unaligned (denoted `p`).
+
+Even unaligned addresses must still be a multiple of `sizeof(T)`, otherwise
+`StoreU` may crash on some platforms (e.g. RVV and ARMv7). Note that C++ ensures
+automatic (stack) and dynamically allocated (via `new` or `malloc`) variables of
+type `T` are aligned to `sizeof(T)`, hence such addresses are suitable for
+`StoreU`. However, casting pointers to `char*` and adding arbitrary offsets (not
+a multiple of `sizeof(T)`) can violate this requirement.
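
A brief sketch of the element-alignment rule above (the `StoreExample` function and the commented-out byte-offset misuse are hypothetical):

```c++
#include <stddef.h>
#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void StoreExample(uint32_t* data, size_t i) {
  const hn::ScalableTag<uint32_t> d;
  const auto v = hn::Set(d, 1u);
  // OK: data + i is a multiple of sizeof(uint32_t) even when it is not a
  // multiple of the vector size.
  hn::StoreU(v, d, data + i);
  // NOT OK: an odd byte offset violates the sizeof(T) requirement and may
  // crash on RVV or ARMv7:
  //   uint8_t* bytes = reinterpret_cast<uint8_t*>(data);
  //   hn::StoreU(v, d, reinterpret_cast<uint32_t*>(bytes + 1));
}
```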
 
 **Note**: computations with low arithmetic intensity (FLOP/s per memory traffic
 bytes), e.g. dot product, can be *1.5 times as fast* when the memory operands
-are naturally aligned. An unaligned access may require two load ports.
+are aligned to the vector size. An unaligned access may require two load ports.
 
 #### Load
 
@@ -802,11 +849,12 @@ F(src[tbl[i]])` because `Scatter` is mor
 #### Store
 
 *   <code>void **Store**(Vec&lt;D&gt; v, D, T* aligned)</code>: copies `v[i]`
-    into `aligned[i]`, which must be naturally aligned. Writes exactly `N *
-    sizeof(T)` bytes.
+    into `aligned[i]`, which must be aligned to the vector size. Writes exactly
+    `N * sizeof(T)` bytes.
 
-*   <code>void **StoreU**(Vec&lt;D&gt; v, D, T* p)</code>: as `Store`, but
-    without the alignment requirement.
+*   <code>void **StoreU**(Vec&lt;D&gt; v, D, T* p)</code>: as `Store`, but the
+    alignment requirement is relaxed to element-aligned (multiple of
+    `sizeof(T)`).
 
 *   <code>void **BlendedStore**(Vec&lt;D&gt; v, M m, D d, T* p)</code>: as
     `StoreU`, but only updates `p` where `m` is true. May fault even where
@@ -814,7 +862,7 @@ F(src[tbl[i]])` because `Scatter` is mor
     cannot happen unless the entire vector is inaccessible. Equivalent to, and
     potentially more efficient than, `StoreU(IfThenElse(m, v, LoadU(d, p)), d,
     p)`. "Blended" indicates this may not be atomic; other threads must not
-    concurrently update `[p, p + Lanes(d))` without sychronization.
+    concurrently update `[p, p + Lanes(d))` without synchronization.
 
 *   <code>void **SafeFillN**(size_t num, T value, D d, T* HWY_RESTRICT
     to)</code>: Sets `to[0, num)` to `value`. If `num` exceeds `Lanes(d)`, the
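
As a sketch of the `BlendedStore` semantics above (the `FillTail` helper is illustrative; `SafeFillN`, described in the surrounding bullet, is the safer choice when the destination may lack padding):

```c++
#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Writes `value` to the first `remaining` (<= Lanes(d)) elements of `p`
// without a scalar tail loop. Per the caveat above, [p, p + Lanes(d)) should
// be accessible, e.g. because the allocation is padded.
void FillTail(float* HWY_RESTRICT p, size_t remaining, float value) {
  const hn::ScalableTag<float> d;
  const auto m = hn::FirstN(d, remaining);  // first `remaining` lanes are true
  hn::BlendedStore(hn::Set(d, value), m, d, p);
}
```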
@@ -851,8 +899,8 @@ All functions except `Stream` are define
     write-only data; avoids cache pollution). May be implemented using a
     CPU-internal buffer. To avoid partial flushes and unpredictable interactions
     with atomics (for example, see Intel SDM Vol 4, Sec. 8.1.2.2), call this
-    consecutively for an entire naturally aligned cache line (typically 64
-    bytes). Each call may write a multiple of `HWY_STREAM_MULTIPLE` bytes, which
+    consecutively for an entire cache line (typically 64 bytes, aligned to its
+    size). Each call may write a multiple of `HWY_STREAM_MULTIPLE` bytes, which
     can exceed `Lanes(d) * sizeof(T)`. The new contents of `aligned` may not be
     visible until `FlushStream` is called.
 
@@ -902,6 +950,16 @@ All functions except `Stream` are define
     <code>V8 **U8FromU32**(V)</code>: special-case `u32` to `u8` conversion when
     all lanes of `V` are already clamped to `[0, 256)`.
 
+*   `D`,`V`: (`u64,u32`), (`u64,u16`), (`u64,u8`), (`u32,u16`), (`u32,u8`), \
+    (`u16,u8`) <code>Vec&lt;D&gt; **TruncateTo**(D, V v)</code>: returns `v[i]`
+    truncated to the smaller type indicated by `T = TFromD<D>`, with the same
+    result as if the more-significant input bits that do not fit in `T` had
+    been zero. Example: `ScalableTag<uint32_t> du32; Rebind<uint8_t,
+    decltype(du32)> du8; TruncateTo(du8, Set(du32, 0xF08F))` is the same as
+    `Set(du8, 0x8F)`.
+
 `DemoteTo` and float-to-int `ConvertTo` return the closest representable value
 if the input exceeds the destination range.
 
@@ -1143,7 +1201,7 @@ instead because they are more general:
     <code>V **Reverse**(D, V a)</code> returns a vector with lanes in reversed
     order (`out[i] == a[Lanes(D()) - 1 - i]`).
 
-The following `ReverseN` must not be called if `Lanes(D()) > N`:
+The following `ReverseN` must not be called if `Lanes(D()) < N`:
 
 *   `V`: `{u,i,f}{16,32,64}` \
     <code>V **Reverse2**(D, V a)</code> returns a vector with each group of 2
@@ -1194,19 +1252,19 @@ Ops in this section are only available i
 
 *   `V`: `u8` \
     <code>V **AESRound**(V state, V round_key)</code>: one round of AES
-    encrytion: `MixColumns(SubBytes(ShiftRows(state))) ^ round_key`. This
+    encryption: `MixColumns(SubBytes(ShiftRows(state))) ^ round_key`. This
     matches x86 AES-NI. The latency is independent of the input values.
 
 *   `V`: `u8` \
     <code>V **AESLastRound**(V state, V round_key)</code>: the last round of AES
-    encrytion: `SubBytes(ShiftRows(state)) ^ round_key`. This matches x86
+    encryption: `SubBytes(ShiftRows(state)) ^ round_key`. This matches x86
     AES-NI. The latency is independent of the input values.
 
 *   `V`: `u64` \
     <code>V **CLMulLower**(V a, V b)</code>: carryless multiplication of the
     lower 64 bits of each 128-bit block into a 128-bit product. The latency is
     independent of the input values (assuming that is true of normal integer
-    multiplication) so this can safely be used in cryto. Applications that wish
+    multiplication) so this can safely be used in crypto. Applications that wish
     to multiply upper with lower halves can `Shuffle01` one of the operands; on
     x86 that is expected to be latency-neutral.
 
@@ -1250,13 +1308,13 @@ The above were previously known as `HWY_
     when attempting to load lanes from unmapped memory, even if the
     corresponding mask element is false. This is the case on ASAN/MSAN builds,
     AMD x86 prior to AVX-512, and ARM NEON. If so, users can prevent faults by
-    ensuring memory addresses are naturally aligned or at least padded
+    ensuring memory addresses are aligned to the vector size or at least padded
     (allocation size increased by at least `Lanes(d)`).
 
 *   `HWY_NATIVE_FMA` expands to 1 if the `MulAdd` etc. ops use native fused
-    multiply-add. Otherwise, `MulAdd(f, m, a)` is implemented as
-    `Add(Mul(f, m), a)`. Checking this can be useful for increasing the
-    tolerance of expected results (around 1E-5 or 1E-6).
+    multiply-add. Otherwise, `MulAdd(f, m, a)` is implemented as `Add(Mul(f, m),
+    a)`. Checking this can be useful for increasing the tolerance of expected
+    results (around 1E-5 or 1E-6).
 
 The following were used to signal the maximum number of lanes for certain
 operations, but this is no longer necessary (nor possible on SVE/RVV), so they
@@ -1267,10 +1325,11 @@ are DEPRECATED:
 
 ## Detecting supported targets
 
-`SupportedTargets()` returns a cached (initialized on-demand) bitfield of the
-targets supported on the current CPU, detected using CPUID on x86 or equivalent.
-This may include targets that are not in `HWY_TARGETS`, and vice versa. If
-there is no overlap the binary will likely crash. This can only happen if:
+`SupportedTargets()` returns a non-cached (re-initialized on each call) bitfield
+of the targets supported on the current CPU, detected using CPUID on x86 or
+equivalent. This may include targets that are not in `HWY_TARGETS`, and vice
+versa. If there is no overlap the binary will likely crash. This can only happen
+if:
 
 *   the specified baseline is not supported by the current CPU, which
     contradicts the definition of baseline, so the configuration is invalid; or
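
A minimal sketch of checking the overlap described above (illustrative only; applications normally rely on `HWY_DYNAMIC_DISPATCH` rather than inspecting the bitfield themselves):

```c++
#include <stdio.h>

#include "hwy/highway.h"  // HWY_TARGETS
#include "hwy/targets.h"  // hwy::SupportedTargets

int main() {
  // If the CPU supports none of the compiled-in targets, dynamic dispatch
  // cannot select a working implementation.
  if ((hwy::SupportedTargets() & HWY_TARGETS) == 0) {
    fprintf(stderr, "No overlap between supported and compiled-in targets\n");
    return 1;
  }
  return 0;
}
```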
@@ -1309,13 +1368,19 @@ as an expression, e.g. `-DHWY_DISABLED_T
 Zero or one of the following macros may be defined to replace the default
 policy for selecting `HWY_TARGETS`:
 
-*   `HWY_COMPILE_ONLY_SCALAR` selects only `HWY_SCALAR`, which disables SIMD.
+*   `HWY_COMPILE_ONLY_EMU128` selects only `HWY_EMU128`, which avoids intrinsics
+    but implements all ops using standard C++.
+*   `HWY_COMPILE_ONLY_SCALAR` selects only `HWY_SCALAR`, which implements
+    single-lane-only ops using standard C++.
 *   `HWY_COMPILE_ONLY_STATIC` selects only `HWY_STATIC_TARGET`, which
     effectively disables dynamic dispatch.
 *   `HWY_COMPILE_ALL_ATTAINABLE` selects all attainable targets (i.e. enabled
     and permitted by the compiler, independently of autovectorization), which
-    maximizes coverage in tests. This may also be defined even if one of
-    `HWY_COMPILE_ONLY_*` is, but will then be ignored.
+    maximizes coverage in tests.
+
+At most one `HWY_COMPILE_ONLY_*` may be defined. `HWY_COMPILE_ALL_ATTAINABLE`
+may also be defined even if one of `HWY_COMPILE_ONLY_*` is, but will then be
+ignored.
 
 If none are defined, but `HWY_IS_TEST` is defined, the default is
 `HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable
@@ -1343,6 +1408,19 @@ constant-propagation issues with Clang o
 *   `SizeTag<N>` is an empty struct, used to select overloaded functions
     appropriate for `N` bytes.
 
+*   `MakeUnsigned<T>` is an alias for an unsigned type of the same size as `T`.
+
+*   `MakeSigned<T>` is an alias for a signed type of the same size as `T`.
+
+*   `MakeFloat<T>` is an alias for a floating-point type of the same size as
+    `T`.
+
+*   `MakeWide<T>` is an alias for a type with twice the size of `T` and the same
+    category (unsigned/signed/float).
+
+*   `MakeNarrow<T>` is an alias for a type with half the size of `T` and the
+    same category (unsigned/signed/float).
+
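
These aliases can be checked at compile time; a small sketch (the assertions mirror the ones added to base_test.cc later in this patch):

```c++
#include <stdint.h>

#include "hwy/base.h"

// All of these follow from the alias definitions above.
static_assert(sizeof(hwy::MakeWide<uint32_t>) == 8, "u32 -> u64");
static_assert(sizeof(hwy::MakeNarrow<uint64_t>) == 4, "u64 -> u32");
static_assert(hwy::IsSigned<hwy::MakeSigned<uint16_t>>(), "u16 -> i16");
static_assert(!hwy::IsSigned<hwy::MakeUnsigned<int8_t>>(), "i8 -> u8");
static_assert(hwy::IsFloat<hwy::MakeFloat<uint32_t>>(), "u32 -> float");
```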
 ## Memory allocation
 
 `AllocateAligned<T>(items)` returns a unique pointer to newly allocated memory
diff -pruN 0.17.0-11/hwy/aligned_allocator_test.cc 1.0.0-2/hwy/aligned_allocator_test.cc
--- 0.17.0-11/hwy/aligned_allocator_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/aligned_allocator_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -23,7 +23,6 @@
 #include <vector>
 
 #include "gtest/gtest.h"
-#include "hwy/base.h"
 
 namespace {
 
diff -pruN 0.17.0-11/hwy/base.h 1.0.0-2/hwy/base.h
--- 0.17.0-11/hwy/base.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/base.h	2022-07-27 11:48:16.000000000 +0000
@@ -229,19 +229,17 @@ static constexpr HWY_MAYBE_UNUSED size_t
 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
 // by concatenating base type and bits.
 
-#if HWY_ARCH_ARM && (__ARM_FP & 2)
-#define HWY_NATIVE_FLOAT16 1
-#else
-#define HWY_NATIVE_FLOAT16 0
-#endif
-
 #pragma pack(push, 1)
 
-#if HWY_NATIVE_FLOAT16
+// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
+// always supported on aarch64, for v7 only if -mfp16-format is given.
+#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
 using float16_t = __fp16;
-// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
-// arguments, so use a wrapper.
-// TODO(janwas): replace with _Float16 when that is supported?
+// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
+// Required for Clang RVV if the float16 extension is used.
+#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
+using float16_t = _Float16;
+// Otherwise emulate
 #else
 struct float16_t {
   uint16_t bits;
@@ -257,6 +255,44 @@ struct bfloat16_t {
 using float32_t = float;
 using float64_t = double;
 
+#pragma pack(push, 1)
+
+// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
+// https://reviews.llvm.org/D86310
+struct alignas(16) uint128_t {
+  uint64_t lo;  // little-endian layout
+  uint64_t hi;
+};
+
+// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
+// field is to be compared (Lt128Upper instead of Lt128).
+struct alignas(16) K64V64 {
+  uint64_t value;  // little-endian layout
+  uint64_t key;
+};
+
+#pragma pack(pop)
+
+static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
+                                              const uint128_t& b) {
+  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
+}
+// Required for std::greater.
+static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
+                                              const uint128_t& b) {
+  return b < a;
+}
+
+static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
+                                              const K64V64& b) {
+  return a.key < b.key;
+}
+// Required for std::greater.
+static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
+                                              const K64V64& b) {
+  return b < a;
+}
+
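
The comparison operators added here make the new key types usable with standard algorithms; a brief usage sketch (the `SortKV` name is illustrative):

```c++
#include <stddef.h>

#include <algorithm>
#include <functional>

#include "hwy/base.h"

// Sorts key-value pairs by key in descending order via the operator> above.
void SortKV(hwy::K64V64* items, size_t num) {
  std::sort(items, items + num, std::greater<hwy::K64V64>());
}
```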
 //------------------------------------------------------------------------------
 // Controlling overload resolution (SFINAE)
 
@@ -309,6 +345,8 @@ HWY_API constexpr bool IsSame() {
   hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
 #define HWY_IF_NOT_LANE_SIZE(T, bytes) \
   hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
+#define HWY_IF_LANE_SIZE_LT(T, bytes) \
+  hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
 
 #define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
   hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
@@ -341,12 +379,16 @@ struct Relations<uint8_t> {
   using Unsigned = uint8_t;
   using Signed = int8_t;
   using Wide = uint16_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<int8_t> {
   using Unsigned = uint8_t;
   using Signed = int8_t;
   using Wide = int16_t;
+  enum { is_signed = 1 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<uint16_t> {
@@ -354,6 +396,8 @@ struct Relations<uint16_t> {
   using Signed = int16_t;
   using Wide = uint32_t;
   using Narrow = uint8_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<int16_t> {
@@ -361,6 +405,8 @@ struct Relations<int16_t> {
   using Signed = int16_t;
   using Wide = int32_t;
   using Narrow = int8_t;
+  enum { is_signed = 1 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<uint32_t> {
@@ -369,6 +415,8 @@ struct Relations<uint32_t> {
   using Float = float;
   using Wide = uint64_t;
   using Narrow = uint16_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<int32_t> {
@@ -377,13 +425,18 @@ struct Relations<int32_t> {
   using Float = float;
   using Wide = int64_t;
   using Narrow = int16_t;
+  enum { is_signed = 1 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<uint64_t> {
   using Unsigned = uint64_t;
   using Signed = int64_t;
   using Float = double;
+  using Wide = uint128_t;
   using Narrow = uint32_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<int64_t> {
@@ -391,6 +444,15 @@ struct Relations<int64_t> {
   using Signed = int64_t;
   using Float = double;
   using Narrow = int32_t;
+  enum { is_signed = 1 };
+  enum { is_float = 0 };
+};
+template <>
+struct Relations<uint128_t> {
+  using Unsigned = uint128_t;
+  using Narrow = uint64_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<float16_t> {
@@ -398,12 +460,16 @@ struct Relations<float16_t> {
   using Signed = int16_t;
   using Float = float16_t;
   using Wide = float;
+  enum { is_signed = 1 };
+  enum { is_float = 1 };
 };
 template <>
 struct Relations<bfloat16_t> {
   using Unsigned = uint16_t;
   using Signed = int16_t;
   using Wide = float;
+  enum { is_signed = 1 };
+  enum { is_float = 1 };
 };
 template <>
 struct Relations<float> {
@@ -412,6 +478,8 @@ struct Relations<float> {
   using Float = float;
   using Wide = double;
   using Narrow = float16_t;
+  enum { is_signed = 1 };
+  enum { is_float = 1 };
 };
 template <>
 struct Relations<double> {
@@ -419,6 +487,8 @@ struct Relations<double> {
   using Signed = int64_t;
   using Float = double;
   using Narrow = float;
+  enum { is_signed = 1 };
+  enum { is_float = 1 };
 };
 
 template <size_t N>
@@ -445,6 +515,10 @@ struct TypeFromSize<8> {
   using Signed = int64_t;
   using Float = double;
 };
+template <>
+struct TypeFromSize<16> {
+  using Unsigned = uint128_t;
+};
 
 }  // namespace detail
 
@@ -470,6 +544,24 @@ using SignedFromSize = typename detail::
 template <size_t N>
 using FloatFromSize = typename detail::TypeFromSize<N>::Float;
 
+// Avoid confusion with SizeTag where the parameter is a lane size.
+using UnsignedTag = SizeTag<0>;
+using SignedTag = SizeTag<0x100>;  // integer
+using FloatTag = SizeTag<0x200>;
+
+template <typename T, class R = detail::Relations<T>>
+constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
+  return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
+}
+
+// For when we only want to distinguish FloatTag from everything else.
+using NonFloatTag = SizeTag<0x400>;
+
+template <typename T, class R = detail::Relations<T>>
+constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
+  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
+}
+
 //------------------------------------------------------------------------------
 // Type traits
 
@@ -640,7 +732,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonze
 #else   // HWY_ARCH_X86_64
   // _BitScanForward64 not available
   uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
-  unsigned long index;
+  unsigned long index;  // NOLINT
   if (lsb == 0) {
     uint32_t msb = static_cast<uint32_t>(x >> 32u);
     _BitScanForward(&index, msb);
@@ -675,7 +767,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonze
 #else   // HWY_ARCH_X86_64
   // _BitScanReverse64 not available
   const uint32_t msb = static_cast<uint32_t>(x >> 32u);
-  unsigned long index;
+  unsigned long index;  // NOLINT
   if (msb == 0) {
     const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
     _BitScanReverse(&index, lsb);
@@ -691,7 +783,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonze
 }
 
 HWY_API size_t PopCount(uint64_t x) {
-#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
+#if HWY_COMPILER_GCC  // includes clang
   return static_cast<size_t>(__builtin_popcountll(x));
   // This instruction has a separate feature flag, but is often called from
   // non-SIMD code, so we don't want to require dynamic dispatch. It was first
@@ -700,7 +792,8 @@ HWY_API size_t PopCount(uint64_t x) {
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
   return _mm_popcnt_u64(x);
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
-  return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
+  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
+         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
 #else
   x -= ((x >> 1) & 0x5555555555555555ULL);
   x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
diff -pruN 0.17.0-11/hwy/base_test.cc 1.0.0-2/hwy/base_test.cc
--- 0.17.0-11/hwy/base_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/base_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -22,7 +22,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "base_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -31,25 +31,26 @@ namespace hwy {
 namespace HWY_NAMESPACE {
 
 HWY_NOINLINE void TestAllLimits() {
-  HWY_ASSERT_EQ(uint8_t(0), LimitsMin<uint8_t>());
-  HWY_ASSERT_EQ(uint16_t(0), LimitsMin<uint16_t>());
-  HWY_ASSERT_EQ(uint32_t(0), LimitsMin<uint32_t>());
-  HWY_ASSERT_EQ(uint64_t(0), LimitsMin<uint64_t>());
-
-  HWY_ASSERT_EQ(int8_t(-128), LimitsMin<int8_t>());
-  HWY_ASSERT_EQ(int16_t(-32768), LimitsMin<int16_t>());
-  HWY_ASSERT_EQ(int32_t(0x80000000u), LimitsMin<int32_t>());
-  HWY_ASSERT_EQ(int64_t(0x8000000000000000ull), LimitsMin<int64_t>());
-
-  HWY_ASSERT_EQ(uint8_t(0xFF), LimitsMax<uint8_t>());
-  HWY_ASSERT_EQ(uint16_t(0xFFFF), LimitsMax<uint16_t>());
-  HWY_ASSERT_EQ(uint32_t(0xFFFFFFFFu), LimitsMax<uint32_t>());
-  HWY_ASSERT_EQ(uint64_t(0xFFFFFFFFFFFFFFFFull), LimitsMax<uint64_t>());
-
-  HWY_ASSERT_EQ(int8_t(0x7F), LimitsMax<int8_t>());
-  HWY_ASSERT_EQ(int16_t(0x7FFF), LimitsMax<int16_t>());
-  HWY_ASSERT_EQ(int32_t(0x7FFFFFFFu), LimitsMax<int32_t>());
-  HWY_ASSERT_EQ(int64_t(0x7FFFFFFFFFFFFFFFull), LimitsMax<int64_t>());
+  HWY_ASSERT_EQ(uint8_t{0}, LimitsMin<uint8_t>());
+  HWY_ASSERT_EQ(uint16_t{0}, LimitsMin<uint16_t>());
+  HWY_ASSERT_EQ(uint32_t{0}, LimitsMin<uint32_t>());
+  HWY_ASSERT_EQ(uint64_t{0}, LimitsMin<uint64_t>());
+
+  HWY_ASSERT_EQ(int8_t{-128}, LimitsMin<int8_t>());
+  HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin<int16_t>());
+  HWY_ASSERT_EQ(static_cast<int32_t>(0x80000000u), LimitsMin<int32_t>());
+  HWY_ASSERT_EQ(static_cast<int64_t>(0x8000000000000000ull),
+                LimitsMin<int64_t>());
+
+  HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax<uint8_t>());
+  HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax<uint16_t>());
+  HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax<uint32_t>());
+  HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax<uint64_t>());
+
+  HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax<int8_t>());
+  HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax<int16_t>());
+  HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax<int32_t>());
+  HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax<int64_t>());
 }
 
 struct TestLowestHighest {
@@ -89,6 +90,10 @@ HWY_NOINLINE void TestAllType() {
   ForUnsignedTypes(TestIsUnsigned());
   ForSignedTypes(TestIsSigned());
   ForFloatTypes(TestIsFloat());
+
+  static_assert(sizeof(MakeUnsigned<hwy::uint128_t>) == 16, "");
+  static_assert(sizeof(MakeWide<uint64_t>) == 16, "Expected uint128_t");
+  static_assert(sizeof(MakeNarrow<hwy::uint128_t>) == 8, "Expected uint64_t");
 }
 
 struct TestIsSame {
@@ -103,54 +108,54 @@ struct TestIsSame {
 HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }
 
 HWY_NOINLINE void TestAllBitScan() {
-  HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
-  HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
-  HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
-  HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
-  HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(2u));
-  HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(3u));
-  HWY_ASSERT_EQ(size_t(31), Num0BitsAboveMS1Bit_Nonzero32(1u));
+  HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
+  HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
+  HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
+  HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
+  HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u));
+  HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u));
+  HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u));
 
-  HWY_ASSERT_EQ(size_t(0),
+  HWY_ASSERT_EQ(size_t{0},
                 Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
-  HWY_ASSERT_EQ(size_t(0),
+  HWY_ASSERT_EQ(size_t{0},
                 Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
-  HWY_ASSERT_EQ(size_t(1),
+  HWY_ASSERT_EQ(size_t{1},
                 Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
-  HWY_ASSERT_EQ(size_t(1),
+  HWY_ASSERT_EQ(size_t{1},
                 Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
-  HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(2ull));
-  HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(3ull));
-  HWY_ASSERT_EQ(size_t(63), Num0BitsAboveMS1Bit_Nonzero64(1ull));
-
-  HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero32(1u));
-  HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero32(2u));
-  HWY_ASSERT_EQ(size_t(30), Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
-  HWY_ASSERT_EQ(size_t(31), Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
-
-  HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero64(1ull));
-  HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero64(2ull));
-  HWY_ASSERT_EQ(size_t(62),
+  HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull));
+  HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull));
+  HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull));
+
+  HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u));
+  HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u));
+  HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
+  HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
+
+  HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull));
+  HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull));
+  HWY_ASSERT_EQ(size_t{62},
                 Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
-  HWY_ASSERT_EQ(size_t(63),
+  HWY_ASSERT_EQ(size_t{63},
                 Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
 }
 
 HWY_NOINLINE void TestAllPopCount() {
-  HWY_ASSERT_EQ(size_t(0), PopCount(0u));
-  HWY_ASSERT_EQ(size_t(1), PopCount(1u));
-  HWY_ASSERT_EQ(size_t(1), PopCount(2u));
-  HWY_ASSERT_EQ(size_t(2), PopCount(3u));
-  HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000u));
-  HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFu));
-  HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFu));
-
-  HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000ull));
-  HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFull));
-  HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFull));
-  HWY_ASSERT_EQ(size_t(33), PopCount(0x10FFFFFFFFull));
-  HWY_ASSERT_EQ(size_t(63), PopCount(0xFFFEFFFFFFFFFFFFull));
-  HWY_ASSERT_EQ(size_t(64), PopCount(0xFFFFFFFFFFFFFFFFull));
+  HWY_ASSERT_EQ(size_t{0}, PopCount(0u));
+  HWY_ASSERT_EQ(size_t{1}, PopCount(1u));
+  HWY_ASSERT_EQ(size_t{1}, PopCount(2u));
+  HWY_ASSERT_EQ(size_t{2}, PopCount(3u));
+  HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u));
+  HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu));
+  HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu));
+
+  HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull));
+  HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull));
+  HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull));
+  HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull));
+  HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull));
+  HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff -pruN 0.17.0-11/hwy/cache_control.h 1.0.0-2/hwy/cache_control.h
--- 0.17.0-11/hwy/cache_control.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/cache_control.h	2022-07-27 11:48:16.000000000 +0000
@@ -51,10 +51,10 @@ namespace hwy {
 #define HWY_ATTR_CACHE
 #endif
 
-// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
-// serves as a full fence (waits for all prior instructions to complete).
-// No effect on non-x86.
-// DEPRECATED due to differing behavior across architectures AND vendors.
+// Delays subsequent loads until prior loads are visible. Beware of potentially
+// differing behavior across architectures and vendors: on Intel but not
+// AMD CPUs, also serves as a full fence (waits for all prior instructions to
+// complete).
 HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
   _mm_lfence();
@@ -77,7 +77,7 @@ template <typename T>
 HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
   _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
-#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+#elif HWY_COMPILER_GCC  // includes clang
   // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
   // desirable, so use the default 3 (keep in caches).
   __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
diff -pruN 0.17.0-11/hwy/contrib/algo/copy_test.cc 1.0.0-2/hwy/contrib/algo/copy_test.cc
--- 0.17.0-11/hwy/contrib/algo/copy_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/algo/copy_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -18,7 +18,7 @@
 // clang-format off
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 #include "hwy/contrib/algo/copy-inl.h"
 #include "hwy/tests/test_util-inl.h"
diff -pruN 0.17.0-11/hwy/contrib/algo/find_test.cc 1.0.0-2/hwy/contrib/algo/find_test.cc
--- 0.17.0-11/hwy/contrib/algo/find_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/algo/find_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -23,7 +23,7 @@
 // clang-format off
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 #include "hwy/contrib/algo/find-inl.h"
 #include "hwy/tests/test_util-inl.h"
diff -pruN 0.17.0-11/hwy/contrib/algo/transform_test.cc 1.0.0-2/hwy/contrib/algo/transform_test.cc
--- 0.17.0-11/hwy/contrib/algo/transform_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/algo/transform_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -19,8 +19,8 @@
 
 // clang-format off
 #undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc"
-#include "hwy/foreach_target.h"
+#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc"  //NOLINT
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 #include "hwy/contrib/algo/transform-inl.h"
 #include "hwy/tests/test_util-inl.h"
diff -pruN 0.17.0-11/hwy/contrib/dot/dot_test.cc 1.0.0-2/hwy/contrib/dot/dot_test.cc
--- 0.17.0-11/hwy/contrib/dot/dot_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/dot/dot_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -22,7 +22,7 @@
 // clang-format off
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 #include "hwy/contrib/dot/dot-inl.h"
 #include "hwy/tests/test_util-inl.h"
diff -pruN 0.17.0-11/hwy/contrib/image/image.cc 1.0.0-2/hwy/contrib/image/image.cc
--- 0.17.0-11/hwy/contrib/image/image.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/image/image.cc	2022-07-27 11:48:16.000000000 +0000
@@ -20,7 +20,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 
 HWY_BEFORE_NAMESPACE();
diff -pruN 0.17.0-11/hwy/contrib/image/image_test.cc 1.0.0-2/hwy/contrib/image/image_test.cc
--- 0.17.0-11/hwy/contrib/image/image_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/image/image_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -15,14 +15,7 @@
 
 #include "hwy/contrib/image/image.h"
 
-#include <cstddef>
-
-#include "hwy/base.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
-#include "hwy/foreach_target.h"
-
+#include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -30,6 +23,11 @@
 #include <random>
 #include <utility>
 
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target:
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
diff -pruN 0.17.0-11/hwy/contrib/math/math_test.cc 1.0.0-2/hwy/contrib/math/math_test.cc
--- 0.17.0-11/hwy/contrib/math/math_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/math/math_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -21,7 +21,7 @@
 // clang-format off
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 #include "hwy/contrib/math/math-inl.h"
 #include "hwy/tests/test_util-inl.h"
diff -pruN 0.17.0-11/hwy/contrib/sort/algo-inl.h 1.0.0-2/hwy/contrib/sort/algo-inl.h
--- 0.17.0-11/hwy/contrib/sort/algo-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/algo-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -34,10 +34,11 @@
 #define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
 #define HAVE_PDQSORT 0
 #define HAVE_SORT512 0
+#define HAVE_VXSORT 0
 
 #if HAVE_AVX2SORT
 HWY_PUSH_ATTRIBUTES("avx2,avx")
-#include "avx2sort.h"
+#include "avx2sort.h"  //NOLINT
 HWY_POP_ATTRIBUTES
 #endif
 #if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
@@ -48,18 +49,59 @@ HWY_POP_ATTRIBUTES
 #include "third_party/boost/allowed/sort/sort.hpp"
 #endif
 #if HAVE_SORT512
-#include "sort512.h"
+#include "sort512.h"  //NOLINT
 #endif
 
+// vxsort is difficult to compile for multiple targets because it also uses
+// .cpp files, and we'd also have to #undef its include guards. Instead, compile
+// only for AVX2 or AVX3 depending on this macro.
+#define VXSORT_AVX3 1
+#if HAVE_VXSORT
+// inlined from vxsort_targets_enable_avx512 (must close before end of header)
+#ifdef __GNUC__
+#ifdef __clang__
+#if VXSORT_AVX3
+#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
+                             apply_to = any(function))
+#else
+#pragma clang attribute push(__attribute__((target("avx2"))), \
+                             apply_to = any(function))
+#endif  // VXSORT_AVX3
+
+#else
+#pragma GCC push_options
+#if VXSORT_AVX3
+#pragma GCC target("avx512f,avx512dq")
+#else
+#pragma GCC target("avx2")
+#endif  // VXSORT_AVX3
+#endif
+#endif
+
+#if VXSORT_AVX3
+#include "vxsort/machine_traits.avx512.h"
+#else
+#include "vxsort/machine_traits.avx2.h"
+#endif  // VXSORT_AVX3
+#include "vxsort/vxsort.h"
+#ifdef __GNUC__
+#ifdef __clang__
+#pragma clang attribute pop
+#else
+#pragma GCC pop_options
+#endif
+#endif
+#endif  // HAVE_VXSORT
+
 namespace hwy {
 
 enum class Dist { kUniform8, kUniform16, kUniform32 };
 
-std::vector<Dist> AllDist() {
+static inline std::vector<Dist> AllDist() {
   return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
 }
 
-const char* DistName(Dist dist) {
+static inline const char* DistName(Dist dist) {
   switch (dist) {
     case Dist::kUniform8:
       return "uniform8";
@@ -94,13 +136,16 @@ class InputStats {
     }
 
     if (min_ != other.min_ || max_ != other.max_) {
-      HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_),
-                double(other.min_), double(other.max_));
+      HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
+                static_cast<double>(max_), static_cast<double>(other.min_),
+                static_cast<double>(other.max_));
     }
 
     // Sum helps detect duplicated/lost values
     if (sum_ != other.sum_) {
-      HWY_ABORT("Sum mismatch; min %g max %g\n", double(min_), double(max_));
+      HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
+                static_cast<double>(sum_), static_cast<double>(other.sum_),
+                static_cast<double>(min_), static_cast<double>(max_));
     }
 
     return true;
@@ -129,12 +174,15 @@ enum class Algo {
 #if HAVE_SORT512
   kSort512,
 #endif
+#if HAVE_VXSORT
+  kVXSort,
+#endif
   kStd,
   kVQSort,
   kHeap,
 };
 
-const char* AlgoName(Algo algo) {
+static inline const char* AlgoName(Algo algo) {
   switch (algo) {
 #if HAVE_AVX2SORT
     case Algo::kSEA:
@@ -156,6 +204,10 @@ const char* AlgoName(Algo algo) {
     case Algo::kSort512:
       return "sort512";
 #endif
+#if HAVE_VXSORT
+    case Algo::kVXSort:
+      return "vxsort";
+#endif
     case Algo::kStd:
       return "std";
     case Algo::kVQSort:
@@ -284,7 +336,7 @@ InputStats<T> GenerateInput(const Dist d
   size_t i = 0;
   for (; i + N <= num; i += N) {
     const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
-#if HWY_ARCH_RVV
+#if HWY_ARCH_RVV || (HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7)
     // v may not be 64-bit aligned
     StoreU(bits, du64, buf.get());
     memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
@@ -318,12 +370,58 @@ struct SharedState {
   std::vector<ThreadLocal> tls{1};
 };
 
-template <class Order, typename T>
-void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
-         size_t thread) {
-  using detail::HeapSort;
+// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For
+// non-128-bit keys they are the same:
+template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
+void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
   using detail::TraitsLane;
   using detail::SharedTraits;
+  if (Order().IsAscending()) {
+    const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
+    return detail::HeapSort(st, keys, num_keys);
+  } else {
+    const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
+    return detail::HeapSort(st, keys, num_keys);
+  }
+}
+
+#if VQSORT_ENABLED
+template <class Order>
+void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
+  using detail::SharedTraits;
+  using detail::Traits128;
+  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
+  const size_t num_lanes = num_keys * 2;
+  if (Order().IsAscending()) {
+    const SharedTraits<Traits128<detail::OrderAscending128>> st;
+    return detail::HeapSort(st, lanes, num_lanes);
+  } else {
+    const SharedTraits<Traits128<detail::OrderDescending128>> st;
+    return detail::HeapSort(st, lanes, num_lanes);
+  }
+}
+
+template <class Order>
+void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
+  using detail::SharedTraits;
+  using detail::Traits128;
+  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
+  const size_t num_lanes = num_keys * 2;
+  if (Order().IsAscending()) {
+    const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
+    return detail::HeapSort(st, lanes, num_lanes);
+  } else {
+    const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
+    return detail::HeapSort(st, lanes, num_lanes);
+  }
+}
+#endif  // VQSORT_ENABLED
+
+template <class Order, typename KeyType>
+void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
+         SharedState& shared, size_t thread) {
+  const std::less<KeyType> less;
+  const std::greater<KeyType> greater;
 
   switch (algo) {
 #if HAVE_AVX2SORT
@@ -334,20 +432,18 @@ void Run(Algo algo, T* HWY_RESTRICT inou
 #if HAVE_IPS4O
     case Algo::kIPS4O:
       if (Order().IsAscending()) {
-        return ips4o::sort(inout, inout + num, std::less<T>());
+        return ips4o::sort(inout, inout + num, less);
       } else {
-        return ips4o::sort(inout, inout + num, std::greater<T>());
+        return ips4o::sort(inout, inout + num, greater);
       }
 #endif
 
 #if HAVE_PARALLEL_IPS4O
     case Algo::kParallelIPS4O:
       if (Order().IsAscending()) {
-        return ips4o::parallel::sort(inout, inout + num, std::less<T>(),
-                                     shared.pool);
+        return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
       } else {
-        return ips4o::parallel::sort(inout, inout + num, std::greater<T>(),
-                                     shared.pool);
+        return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
       }
 #endif
 
@@ -360,33 +456,47 @@ void Run(Algo algo, T* HWY_RESTRICT inou
 #if HAVE_PDQSORT
     case Algo::kPDQ:
       if (Order().IsAscending()) {
-        return boost::sort::pdqsort_branchless(inout, inout + num,
-                                               std::less<T>());
+        return boost::sort::pdqsort_branchless(inout, inout + num, less);
       } else {
-        return boost::sort::pdqsort_branchless(inout, inout + num,
-                                               std::greater<T>());
+        return boost::sort::pdqsort_branchless(inout, inout + num, greater);
       }
 #endif
 
+#if HAVE_VXSORT
+    case Algo::kVXSort: {
+#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
+    (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
+      fprintf(stderr, "Do not call for target %s\n",
+              hwy::TargetName(HWY_TARGET));
+      return;
+#else
+#if VXSORT_AVX3
+      vxsort::vxsort<KeyType, vxsort::AVX512> vx;
+#else
+      vxsort::vxsort<KeyType, vxsort::AVX2> vx;
+#endif
+      if (Order().IsAscending()) {
+        return vx.sort(inout, inout + num - 1);
+      } else {
+        fprintf(stderr, "Skipping VX - does not support descending order\n");
+        return;
+      }
+#endif  // enabled for this target
+    }
+#endif  // HAVE_VXSORT
+
     case Algo::kStd:
       if (Order().IsAscending()) {
-        return std::sort(inout, inout + num, std::less<T>());
+        return std::sort(inout, inout + num, less);
       } else {
-        return std::sort(inout, inout + num, std::greater<T>());
+        return std::sort(inout, inout + num, greater);
       }
 
     case Algo::kVQSort:
       return shared.tls[thread].sorter(inout, num, Order());
 
     case Algo::kHeap:
-      HWY_ASSERT(sizeof(T) < 16);
-      if (Order().IsAscending()) {
-        const SharedTraits<TraitsLane<detail::OrderAscending>> st;
-        return HeapSort(st, inout, num);
-      } else {
-        const SharedTraits<TraitsLane<detail::OrderDescending>> st;
-        return HeapSort(st, inout, num);
-      }
+      return CallHeapSort<Order>(inout, num);
 
     default:
       HWY_ABORT("Not implemented");
diff -pruN 0.17.0-11/hwy/contrib/sort/bench_parallel.cc 1.0.0-2/hwy/contrib/sort/bench_parallel.cc
--- 0.17.0-11/hwy/contrib/sort/bench_parallel.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/bench_parallel.cc	2022-07-27 11:48:16.000000000 +0000
@@ -28,10 +28,9 @@
 #include <vector>
 
 // clang-format off
-#include "hwy/contrib/sort/vqsort.h"
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"  //NOLINT
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/algo-inl.h"
@@ -46,8 +45,6 @@ namespace hwy {
 namespace HWY_NAMESPACE {
 namespace {
 
-#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128
-
 class ThreadPool {
  public:
   // Starts the given number of worker threads and blocks until they are ready.
@@ -169,16 +166,21 @@ class ThreadPool {
   const void* data_;                               // points to caller's Func
 };
 
-template <class Order, typename T>
-void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo,
-                      SharedState& shared, size_t thread) {
-  auto aligned = hwy::AllocateAligned<T>(num);
+template <class Traits>
+void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
+                      const Algo algo, SharedState& shared, size_t thread) {
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  using Order = typename Traits::Order;
+  const size_t num_lanes = num_keys * st.LanesPerKey();
+  auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
 
-  (void)GenerateInput(dist, aligned.get(), num);
+  (void)GenerateInput(dist, aligned.get(), num_lanes);
 
   const Timestamp t0;
-  Run<Order>(algo, aligned.get(), num, shared, thread);
-  HWY_ASSERT(aligned[0] < aligned[num - 1]);
+  Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared,
+             thread);
+  HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]);
 }
 
 void BenchParallel() {
@@ -190,17 +192,16 @@ void BenchParallel() {
   ThreadPool pool;
   const size_t NT = pool.NumThreads();
 
-  using T = int64_t;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
-
-  size_t num = 100 * 1000 * 1000;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
+  using KeyType = typename decltype(st)::KeyType;
+  const size_t num_keys = size_t{100} * 1000 * 1000;
 
 #if HAVE_IPS4O
   const Algo algo = Algo::kIPS4O;
 #else
   const Algo algo = Algo::kVQSort;
 #endif
-  const Dist dist = Dist::kUniform16;
+  const Dist dist = Dist::kUniform32;
 
   SharedState shared;
   shared.tls.resize(NT);
@@ -210,18 +211,15 @@ void BenchParallel() {
     Timestamp t0;
     // Default capture because MSVC wants algo/dist but clang does not.
     pool.RunOnThreads(nt, [=, &shared](size_t thread) {
-      RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread);
+      RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
     });
     const double sec = SecondsSince(t0);
-    results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec));
+    results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType),
+                         st.KeyString());
     results.back().Print();
   }
 }
 
-#else
-void BenchParallel() {}
-#endif
-
 }  // namespace
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
diff -pruN 0.17.0-11/hwy/contrib/sort/bench_sort.cc 1.0.0-2/hwy/contrib/sort/bench_sort.cc
--- 0.17.0-11/hwy/contrib/sort/bench_sort.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/bench_sort.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,31 +13,34 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>  // memcpy
+
+#include <vector>
+
 // clang-format off
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/algo-inl.h"
 #include "hwy/contrib/sort/result-inl.h"
-#include "hwy/contrib/sort/vqsort.h"
 #include "hwy/contrib/sort/sorting_networks-inl.h"  // SharedTraits
 #include "hwy/contrib/sort/traits-inl.h"
 #include "hwy/contrib/sort/traits128-inl.h"
 #include "hwy/tests/test_util-inl.h"
 // clang-format on
 
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>  // memcpy
-
-#include <vector>
+// Mode for larger sorts because M1 is able to access more than the per-core
+// share of L2, so 1M elements might still be in cache.
+#define SORT_100M 0
 
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 // Defined within HWY_ONCE, used by BenchAllSort.
-extern uint32_t first_sort_target;
+extern int64_t first_sort_target;
 
 namespace HWY_NAMESPACE {
 namespace {
@@ -46,41 +49,51 @@ using detail::OrderAscending;
 using detail::OrderDescending;
 using detail::SharedTraits;
 
-#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128
+#if VQSORT_ENABLED || HWY_IDE
 using detail::OrderAscending128;
-using detail::OrderDescending128;
 using detail::Traits128;
 
-template <class Traits, typename T>
+template <class Traits>
 HWY_NOINLINE void BenchPartition() {
-  const SortTag<T> d;
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  const SortTag<LaneType> d;
   detail::SharedTraits<Traits> st;
   const Dist dist = Dist::kUniform8;
   double sum = 0.0;
 
+  detail::Generator rng(&sum, 123);  // for ChoosePivot
+
   const size_t max_log2 = AdjustedLog2Reps(20);
   for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
-    const size_t num = 1ull << log2;
-    auto aligned = hwy::AllocateAligned<T>(num);
-    auto buf =
-        hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d)));
+    const size_t num_lanes = 1ull << log2;
+    const size_t num_keys = num_lanes / st.LanesPerKey();
+    auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+    auto buf = hwy::AllocateAligned<LaneType>(
+        HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)),
+                hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d))));
 
     std::vector<double> seconds;
-    const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps;
+    const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;
     for (size_t rep = 0; rep < num_reps; ++rep) {
-      (void)GenerateInput(dist, aligned.get(), num);
+      (void)GenerateInput(dist, aligned.get(), num_lanes);
 
-      const Timestamp t0;
+      // The pivot value can influence performance. Do exactly what vqsort will
+      // do so that the performance (influenced by prefetching and branch
+      // prediction) is likely to predict the actual performance inside vqsort.
+      const auto pivot = detail::ChoosePivot(d, st, aligned.get(), 0, num_lanes,
+                                             buf.get(), rng);
 
-      detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)),
+      const Timestamp t0;
+      detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
                         buf.get());
       seconds.push_back(SecondsSince(t0));
       // 'Use' the result to prevent optimizing out the partition.
-      sum += static_cast<double>(aligned.get()[num / 2]);
+      sum += static_cast<double>(aligned.get()[num_lanes / 2]);
     }
 
-    MakeResult<T>(Algo::kVQSort, dist, st, num, 1,
-                  SummarizeMeasurements(seconds))
+    Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),
+           sizeof(KeyType), st.KeyString())
         .Print();
   }
   HWY_ASSERT(sum != 999999);  // Prevent optimizing out
@@ -88,51 +101,60 @@ HWY_NOINLINE void BenchPartition() {
 
 HWY_NOINLINE void BenchAllPartition() {
   // Not interested in benchmark results for these targets
-  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
+  if (HWY_TARGET == HWY_SSSE3) {
     return;
   }
 
-  BenchPartition<TraitsLane<OrderDescending>, float>();
-  BenchPartition<TraitsLane<OrderAscending>, int64_t>();
-  BenchPartition<Traits128<OrderDescending128>, uint64_t>();
+  BenchPartition<TraitsLane<OrderDescending<float>>>();
+  BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
+  BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
+  BenchPartition<Traits128<OrderAscending128>>();
+  // BenchPartition<Traits128<OrderDescending128>>();
+  // BenchPartition<Traits128<OrderAscendingKV128>>();
 }
 
-template <class Traits, typename T>
+template <class Traits>
 HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
   // Not interested in benchmark results for these targets
   if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
     return;
   }
 
-  const SortTag<T> d;
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  const SortTag<LaneType> d;
   detail::SharedTraits<Traits> st;
   const Dist dist = Dist::kUniform32;
 
   const size_t N = Lanes(d);
-  const size_t num = SortConstants::BaseCaseNum(N);
-  auto keys = hwy::AllocateAligned<T>(num);
-  auto buf = hwy::AllocateAligned<T>(num + N);
+  const size_t num_lanes = SortConstants::BaseCaseNum(N);
+  const size_t num_keys = num_lanes / st.LanesPerKey();
+  auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
+  auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
 
   std::vector<double> seconds;
   double sum = 0;                             // prevents elision
   constexpr size_t kMul = AdjustedReps(600);  // ensures long enough to measure
 
-  for (size_t rep = 0; rep < kReps; ++rep) {
-    InputStats<T> input_stats = GenerateInput(dist, keys.get(), num);
+  for (size_t rep = 0; rep < 30; ++rep) {
+    InputStats<LaneType> input_stats =
+        GenerateInput(dist, keys.get(), num_lanes);
 
     const Timestamp t0;
     for (size_t i = 0; i < kMul; ++i) {
-      detail::BaseCase(d, st, keys.get(), num, buf.get());
+      detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
+                       buf.get());
       sum += static_cast<double>(keys[0]);
     }
     seconds.push_back(SecondsSince(t0));
     // printf("%f\n", seconds.back());
 
-    HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase"));
+    HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
   }
   HWY_ASSERT(sum < 1E99);
-  results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1,
-                                  SummarizeMeasurements(seconds)));
+  results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,
+                       SummarizeMeasurements(seconds), sizeof(KeyType),
+                       st.KeyString());
 }
 
 HWY_NOINLINE void BenchAllBase() {
@@ -142,14 +164,19 @@ HWY_NOINLINE void BenchAllBase() {
   }
 
   std::vector<Result> results;
-  BenchBase<TraitsLane<OrderAscending>, float>(results);
-  BenchBase<TraitsLane<OrderDescending>, int64_t>(results);
-  BenchBase<Traits128<OrderAscending128>, uint64_t>(results);
+  BenchBase<TraitsLane<OrderAscending<float>>>(results);
+  BenchBase<TraitsLane<OrderDescending<int64_t>>>(results);
+  BenchBase<Traits128<OrderAscending128>>(results);
   for (const Result& r : results) {
     r.Print();
   }
 }
 
+#else
+void BenchAllPartition() {}
+void BenchAllBase() {}
+#endif  // VQSORT_ENABLED
+
 std::vector<Algo> AlgoForBench() {
   return {
 #if HAVE_AVX2SORT
@@ -166,45 +193,64 @@ std::vector<Algo> AlgoForBench() {
 #if HAVE_SORT512
         Algo::kSort512,
 #endif
+// Only include if we're compiling for the target it supports.
+#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
+                    (!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
+        Algo::kVXSort,
+#endif
 
-// These are 10-20x slower, but that's OK for the default size when we are
-// not testing the parallel mode.
 #if !HAVE_PARALLEL_IPS4O
+#if !SORT_100M
+        // These are 10-20x slower, but that's OK for the default size when we
+        // are not testing the parallel nor 100M modes.
         Algo::kStd, Algo::kHeap,
+#endif
 
         Algo::kVQSort,  // only ~4x slower, but not required for Table 1a
 #endif
-
   };
 }
 
-template <class Traits, typename T>
-HWY_NOINLINE void BenchSort(size_t num) {
+template <class Traits>
+HWY_NOINLINE void BenchSort(size_t num_keys) {
   if (first_sort_target == 0) first_sort_target = HWY_TARGET;
 
   SharedState shared;
   detail::SharedTraits<Traits> st;
-  auto aligned = hwy::AllocateAligned<T>(num);
+  using Order = typename Traits::Order;
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+  const size_t num_lanes = num_keys * st.LanesPerKey();
+  auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+
+  const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;
+
   for (Algo algo : AlgoForBench()) {
     // Other algorithms don't depend on the vector instructions, so only run
     // them for the first target.
-    if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) continue;
+#if !HAVE_VXSORT
+    if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
+      continue;
+    }
+#endif
 
     for (Dist dist : AllDist()) {
       std::vector<double> seconds;
-      for (size_t rep = 0; rep < kReps; ++rep) {
-        InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num);
+      for (size_t rep = 0; rep < reps; ++rep) {
+        InputStats<LaneType> input_stats =
+            GenerateInput(dist, aligned.get(), num_lanes);
 
         const Timestamp t0;
-        Run<typename Traits::Order>(algo, aligned.get(), num, shared,
-                                    /*thread=*/0);
+        Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
+                   shared, /*thread=*/0);
         seconds.push_back(SecondsSince(t0));
         // printf("%f\n", seconds.back());
 
         HWY_ASSERT(
-            VerifySort(st, input_stats, aligned.get(), num, "BenchSort"));
+            VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
       }
-      MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds))
+      Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),
+             sizeof(KeyType), st.KeyString())
           .Print();
     }  // dist
   }    // algo
@@ -212,40 +258,40 @@ HWY_NOINLINE void BenchSort(size_t num)
 
 HWY_NOINLINE void BenchAllSort() {
   // Not interested in benchmark results for these targets
-  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
+  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
+      HWY_TARGET == HWY_EMU128) {
     return;
   }
+  // Only enable EMU128 on x86 - it's slow on emulators.
+  if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
 
   constexpr size_t K = 1000;
   constexpr size_t M = K * K;
   (void)K;
   (void)M;
-  for (size_t num : {
-#if HAVE_PARALLEL_IPS4O
+  for (size_t num_keys : {
+#if HAVE_PARALLEL_IPS4O || SORT_100M
          100 * M,
 #else
-         AdjustedReps(1 * M),
+        1 * M,
 #endif
        }) {
-    BenchSort<TraitsLane<OrderAscending>, float>(num);
-    // BenchSort<TraitsLane<OrderDescending>, double>(num);
-    // BenchSort<TraitsLane<OrderAscending>, int16_t>(num);
-    BenchSort<TraitsLane<OrderDescending>, int32_t>(num);
-    BenchSort<TraitsLane<OrderAscending>, int64_t>(num);
-    // BenchSort<TraitsLane<OrderDescending>, uint16_t>(num);
-    // BenchSort<TraitsLane<OrderDescending>, uint32_t>(num);
-    // BenchSort<TraitsLane<OrderAscending>, uint64_t>(num);
-
-    BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
+    BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
+    // BenchSort<TraitsLane<OrderDescending<double>>>(num_keys);
+    // BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
+    BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys);
+    BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
+    // BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys);
+    // BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys);
+    // BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);
+
+#if !HAVE_VXSORT && VQSORT_ENABLED
+    BenchSort<Traits128<OrderAscending128>>(num_keys);
+    // BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
+#endif
   }
 }
 
-#else
-void BenchAllPartition() {}
-void BenchAllBase() {}
-void BenchAllSort() {}
-#endif
-
 }  // namespace
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
@@ -255,7 +301,7 @@ HWY_AFTER_NAMESPACE();
 #if HWY_ONCE
 
 namespace hwy {
-uint32_t first_sort_target = 0;  // none run yet
+int64_t first_sort_target = 0;  // none run yet
 namespace {
 HWY_BEFORE_TEST(BenchSort);
 HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
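BenchSort now takes only a Traits parameter; the lane type, key type and sort order travel as member typedefs of the traits class rather than as separate template arguments. A minimal sketch of the members BenchSort relies on, using the U128 case for concreteness (illustrative only; the real classes are TraitsLane<Order> in traits-inl.h and Traits128<Order> in traits128-inl.h, and ExampleTraits128 is a hypothetical name):

```
#include <cstddef>
#include <cstdint>
#include <string>

#include "hwy/base.h"                 // hwy::uint128_t
#include "hwy/contrib/sort/vqsort.h"  // hwy::SortAscending

// Hypothetical stand-in showing the members BenchSort uses.
struct ExampleTraits128 {
  using Order = hwy::SortAscending;  // selects the comparison direction
  using LaneType = uint64_t;         // buffers are allocated as u64 lanes
  using KeyType = hwy::uint128_t;    // type passed to Sorter::operator()
  constexpr size_t LanesPerKey() const { return 2; }  // one u128 = two u64 lanes
  std::string KeyString() const { return "U128"; }    // label printed by Result
};
```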
diff -pruN 0.17.0-11/hwy/contrib/sort/BUILD 1.0.0-2/hwy/contrib/sort/BUILD
--- 0.17.0-11/hwy/contrib/sort/BUILD	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/BUILD	2022-07-27 11:48:16.000000000 +0000
@@ -8,32 +8,87 @@ COMPAT = [
     "//buildenv/target:non_prod",  # includes mobile/vendor.
 ]
 
+# cc_library(
+#     name = "vxsort",
+#     srcs = [
+#         "vxsort/isa_detection.cpp",
+#         "vxsort/isa_detection_msvc.cpp",
+#         "vxsort/isa_detection_sane.cpp",
+#         "vxsort/machine_traits.avx2.cpp",
+#         "vxsort/smallsort/avx2_load_mask_tables.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
+#         "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
+#         "vxsort/vxsort_stats.cpp",
+#     ],
+#     hdrs = [
+#         "vxsort/alignment.h",
+#         "vxsort/defs.h",
+#         "vxsort/isa_detection.h",
+#         "vxsort/machine_traits.avx2.h",
+#         "vxsort/machine_traits.avx512.h",
+#         "vxsort/machine_traits.h",
+#         "vxsort/packer.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
+#         "vxsort/smallsort/bitonic_sort.h",
+#         "vxsort/vxsort.h",
+#         "vxsort/vxsort_stats.h",
+#     ],
+#     compatible_with = [],
+#     textual_hdrs = [
+#         "vxsort/vxsort_targets_disable.h",
+#         "vxsort/vxsort_targets_enable_avx2.h",
+#         "vxsort/vxsort_targets_enable_avx512.h",
+#     ],
+# )
+
 cc_library(
     name = "vqsort",
     srcs = [
         # Split into separate files to reduce MSVC build time.
         "vqsort.cc",
-        "vqsort_i16a.cc",
-        "vqsort_i16d.cc",
-        "vqsort_u16a.cc",
-        "vqsort_u16d.cc",
+        "vqsort_128a.cc",
+        "vqsort_128d.cc",
         "vqsort_f32a.cc",
         "vqsort_f32d.cc",
-        "vqsort_i32a.cc",
-        "vqsort_i32d.cc",
-        "vqsort_u32a.cc",
-        "vqsort_u32d.cc",
         "vqsort_f64a.cc",
         "vqsort_f64d.cc",
+        "vqsort_i16a.cc",
+        "vqsort_i16d.cc",
+        "vqsort_i32a.cc",
+        "vqsort_i32d.cc",
         "vqsort_i64a.cc",
         "vqsort_i64d.cc",
+        "vqsort_kv128a.cc",
+        "vqsort_kv128d.cc",
+        "vqsort_u16a.cc",
+        "vqsort_u16d.cc",
+        "vqsort_u32a.cc",
+        "vqsort_u32d.cc",
         "vqsort_u64a.cc",
         "vqsort_u64d.cc",
-        "vqsort_128a.cc",
-        "vqsort_128d.cc",
     ],
     hdrs = [
-        "disabled_targets.h",
         "vqsort.h",  # public interface
     ],
     compatible_with = [],
@@ -49,6 +104,7 @@ cc_library(
         # Only if VQSORT_SECURE_RNG is set.
         # "//third_party/absl/random",
         "//:hwy",
+        # ":vxsort",  # required if HAVE_VXSORT
     ],
 )
 
@@ -87,8 +143,7 @@ cc_test(
     name = "sort_test",
     size = "medium",
     srcs = ["sort_test.cc"],
-    features = ["fully_static_link"],
-    linkstatic = True,
+    # Do not enable fully_static_link (pthread crash on bazel)
     local_defines = ["HWY_IS_TEST"],
     # for test_suite.
     tags = ["hwy_ops_test"],
@@ -105,8 +160,7 @@ cc_binary(
     name = "bench_sort",
     testonly = 1,
     srcs = ["bench_sort.cc"],
-    features = ["fully_static_link"],
-    linkstatic = True,
+    # Do not enable fully_static_link (pthread crash on bazel)
     local_defines = ["HWY_IS_TEST"],
     deps = [
         ":helpers",
@@ -121,8 +175,7 @@ cc_binary(
     name = "bench_parallel",
     testonly = 1,
     srcs = ["bench_parallel.cc"],
-    features = ["fully_static_link"],
-    linkstatic = True,
+    # Do not enable fully_static_link (pthread crash on bazel)
     local_defines = ["HWY_IS_TEST"],
     deps = [
         ":helpers",
diff -pruN 0.17.0-11/hwy/contrib/sort/disabled_targets.h 1.0.0-2/hwy/contrib/sort/disabled_targets.h
--- 0.17.0-11/hwy/contrib/sort/disabled_targets.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/disabled_targets.h	1970-01-01 00:00:00.000000000 +0000
@@ -1,31 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Speed up MSVC builds by building fewer targets. This header must be included
-// from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
-// However, users of vqsort.h are unaffected.
-
-#ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
-#define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
-
-#include "hwy/base.h"
-
-#if HWY_COMPILER_MSVC
-#undef HWY_DISABLED_TARGETS
-// Either HWY_SCALAR/HWY_EMU128 remains, so we still have a valid target.
-#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
-#endif  // HWY_COMPILER_MSVC
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
diff -pruN 0.17.0-11/hwy/contrib/sort/README.md 1.0.0-2/hwy/contrib/sort/README.md
--- 0.17.0-11/hwy/contrib/sort/README.md	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/README.md	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,81 @@
+# Vectorized and performance-portable Quicksort
+
+## Introduction
+
+As of 2022-06-07 this sorts large arrays of built-in types about ten times as
+fast as `std::sort`. See also our
+[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
+and [paper](https://arxiv.org/abs/2205.05982).
+
+## Instructions
+
+Here are instructions for reproducing our results on x86 Linux (AVX2, AVX-512)
+and Arm V1 (NEON, SVE).
+
+### x86 (Linux)
+
+Please first ensure that golang and Clang (tested with 13.0.1) are installed
+via your system's package manager.
+
+```
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+### AWS Graviton3
+
+Instance config: Amazon Linux 5.10 arm64, c7g.8xlarge (the largest allowed
+config is 32 vCPU). The initial launch will fail; wait a few minutes for an
+email saying the config is verified, then re-launch. See the IPv4 hostname in
+the list of instances.
+
+`ssh -i /path/key.pem ec2-user@hostname`
+
+Note that the AWS CMake package is too old for llvm, so we build it first:
+```
+wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
+tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
+./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
+make -j8 && sudo make install
+cd ..
+```
+
+AWS clang is at version 11.1, which generates unnecessary AND instructions that
+slow down the sort by 1.15x. We tested with clang trunk as of June 13
+(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
+```
+git clone --depth 1 https://github.com/llvm/llvm-project.git
+cd llvm-project
+mkdir -p build && cd build
+/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
+make -j32 && sudo make install
+```
+
+```
+sudo yum install go
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+## Results
+
+`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
+algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
+sorted (f32 is float), the distribution of keys (uniform32 for uniform random
+with range 0-2^32), the number of keys, then the throughput of sorted keys (i.e.
+number of key bytes output per second).
+
+Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
+
+```
+[ RUN      ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
+      AVX3:          std:     f32: uniform32: 1.00E+06   54 MB/s ( 1 threads)
+      AVX3:           vq:     f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
+```
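The MB/s column is keys × threads × key size divided by elapsed seconds, matching Result::Print in result-inl.h below. A quick back-of-the-envelope check of the two excerpt lines (a standalone sketch, not part of the benchmark):

```
#include <cstdio>

int main() {
  // MB/s = num_keys * num_threads * sizeof(key) * 1E-6 / seconds, hence
  // seconds = num_keys * num_threads * sizeof(key) * 1E-6 / MBps.
  const double bytes = 1.00E6 * 1 * sizeof(float);        // 1M f32 keys, 1 thread
  printf("std : %.4f s per sort\n", bytes * 1E-6 / 54);    // ~0.074 s
  printf("vq  : %.4f s per sort\n", bytes * 1E-6 / 1143);  // ~0.0035 s
  return 0;
}
```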
diff -pruN 0.17.0-11/hwy/contrib/sort/result-inl.h 1.0.0-2/hwy/contrib/sort/result-inl.h
--- 0.17.0-11/hwy/contrib/sort/result-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/result-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -34,20 +34,19 @@ struct Timestamp {
   double t;
 };
 
-double SecondsSince(const Timestamp& t0) {
+static inline double SecondsSince(const Timestamp& t0) {
   const Timestamp t1;
   return t1.t - t0.t;
 }
 
-constexpr size_t kReps = 30;
-
 // Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
 // enough for the mode to be reliable).
-double SummarizeMeasurements(std::vector<double>& seconds) {
+static inline double SummarizeMeasurements(std::vector<double>& seconds) {
   std::sort(seconds.begin(), seconds.end());
   double sum = 0;
   int count = 0;
-  for (size_t i = kReps / 4; i < seconds.size() - kReps / 2; ++i) {
+  const size_t num = seconds.size();
+  for (size_t i = num / 4; i < num / 2; ++i) {
     sum += seconds[i];
     count += 1;
   }
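With the fixed kReps constant gone, the trim window depends on the sample count itself: the sorted timings from index num/4 up to (but not including) num/2 are averaged. A small standalone sketch of which samples that selects for the default 30 repetitions (placeholder timings, illustrative only):

```
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> seconds(30, 1.0);  // placeholder timings
  std::sort(seconds.begin(), seconds.end());
  const size_t num = seconds.size();
  double sum = 0;
  int count = 0;
  for (size_t i = num / 4; i < num / 2; ++i) {  // indices 7..14 when num == 30
    sum += seconds[i];
    count += 1;  // ends at 8: a quarter of the samples, just below the median
  }
  printf("averaging %d of %zu samples -> %f\n", count, num, sum / count);
  return 0;
}
```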
@@ -72,72 +71,62 @@ namespace HWY_NAMESPACE {
 
 struct Result {
   Result() {}
-  Result(const uint32_t target, const Algo algo, Dist dist, bool is128,
-         size_t num, size_t num_threads, double sec, size_t sizeof_t,
-         const char* type_name)
-      : target(target),
+  Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
+         double sec, size_t sizeof_key, const std::string& key_name)
+      : target(HWY_TARGET),
         algo(algo),
         dist(dist),
-        is128(is128),
-        num(num),
+        num_keys(num_keys),
         num_threads(num_threads),
         sec(sec),
-        sizeof_t(sizeof_t),
-        type_name(type_name) {}
+        sizeof_key(sizeof_key),
+        key_name(key_name) {}
 
   void Print() const {
-    const double bytes = static_cast<double>(num) *
+    const double bytes = static_cast<double>(num_keys) *
                          static_cast<double>(num_threads) *
-                         static_cast<double>(sizeof_t);
+                         static_cast<double>(sizeof_key);
     printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
-           hwy::TargetName(target), AlgoName(algo),
-           is128 ? "u128" : type_name.c_str(), DistName(dist),
-           static_cast<double>(num), bytes * 1E-6 / sec, num_threads);
+           hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
+           DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
+           num_threads);
   }
 
-  uint32_t target;
+  int64_t target;
   Algo algo;
   Dist dist;
-  bool is128;
-  size_t num = 0;
+  size_t num_keys = 0;
   size_t num_threads = 0;
   double sec = 0.0;
-  size_t sizeof_t = 0;
-  std::string type_name;
+  size_t sizeof_key = 0;
+  std::string key_name;
 };
 
-template <typename T, class Traits>
-Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num,
-                  size_t num_threads, double sec) {
-  char string100[100];
-  hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100);
-  return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec,
-                sizeof(T), string100);
-}
-
-template <class Traits, typename T>
-bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out,
-                size_t num, const char* caller) {
-  constexpr size_t N1 = st.Is128() ? 2 : 1;
-  HWY_ASSERT(num >= N1);
+template <class Traits, typename LaneType>
+bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
+                const LaneType* out, size_t num_lanes, const char* caller) {
+  constexpr size_t N1 = st.LanesPerKey();
+  HWY_ASSERT(num_lanes >= N1);
 
-  InputStats<T> output_stats;
+  InputStats<LaneType> output_stats;
   // Ensure it matches the sort order
-  for (size_t i = 0; i < num - N1; i += N1) {
+  for (size_t i = 0; i < num_lanes - N1; i += N1) {
     output_stats.Notify(out[i]);
     if (N1 == 2) output_stats.Notify(out[i + 1]);
     // Reverse order instead of checking !Compare1 so we accept equal keys.
     if (st.Compare1(out + i + N1, out + i)) {
-      printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n", caller,
-             static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1),
-             double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]),
-             double(out[i + N1]));
+      printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
+             caller, static_cast<int>(i), static_cast<int>(num_lanes),
+             static_cast<int>(N1), static_cast<double>(out[i + 1]),
+             static_cast<double>(out[i + 0]),
+             static_cast<double>(out[i + N1 + 1]),
+             static_cast<double>(out[i + N1]));
       HWY_ABORT("%d-bit sort is incorrect\n",
-                static_cast<int>(sizeof(T) * 8 * N1));
+                static_cast<int>(sizeof(LaneType) * 8 * N1));
     }
   }
-  output_stats.Notify(out[num - N1]);
-  if (N1 == 2) output_stats.Notify(out[num - N1 + 1]);
+  output_stats.Notify(out[num_lanes - N1]);
+  if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);
 
   return input_stats == output_stats;
 }
diff -pruN 0.17.0-11/hwy/contrib/sort/shared-inl.h 1.0.0-2/hwy/contrib/sort/shared-inl.h
--- 0.17.0-11/hwy/contrib/sort/shared-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/shared-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -28,8 +28,8 @@ namespace hwy {
 struct SortConstants {
 // SortingNetwork reshapes its input into a matrix. This is the maximum number
 // of *keys* per vector.
-#if HWY_COMPILER_MSVC
-  static constexpr size_t kMaxCols = 8;  // avoids build timeout
+#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
+  static constexpr size_t kMaxCols = 8;  // avoid build timeout/stack overflow
 #else
   static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
 #endif
@@ -102,6 +102,17 @@ struct SortConstants {
 
 #include "hwy/highway.h"
 
+// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
+// Arm v7 debug.
+#undef VQSORT_ENABLED
+#if (HWY_TARGET == HWY_SCALAR) ||                 \
+    (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
+    (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
+#define VQSORT_ENABLED 0
+#else
+#define VQSORT_ENABLED 1
+#endif
+
 namespace hwy {
 namespace HWY_NAMESPACE {
 
diff -pruN 0.17.0-11/hwy/contrib/sort/sorting_networks-inl.h 1.0.0-2/hwy/contrib/sort/sorting_networks-inl.h
--- 0.17.0-11/hwy/contrib/sort/sorting_networks-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/sorting_networks-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -22,7 +22,6 @@
 #define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
 #endif
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/shared-inl.h"  // SortConstants
 #include "hwy/highway.h"
 
@@ -31,6 +30,8 @@ namespace hwy {
 namespace HWY_NAMESPACE {
 namespace detail {
 
+#if VQSORT_ENABLED
+
 using Constants = hwy::SortConstants;
 
 // ------------------------------ SharedTraits
@@ -595,12 +596,14 @@ HWY_INLINE void Merge16(D d, Traits st,
 // `buf` ensures full vectors are aligned, and enables loads/stores without
 //   bounds checks.
 //
+// NOINLINE because this is large and called twice from vqsort-inl.h.
+//
 // References:
 // https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
 // https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
 // "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
 template <class Traits, typename T>
-HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
+HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
   const CappedTag<T, Constants::kMaxCols> d;
   using V = decltype(Zero(d));
 
@@ -647,8 +650,8 @@ HWY_INLINE void SortingNetwork(Traits st
         Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
                ve, vf);
 
-        // Avoids build timeout
-#if !HWY_COMPILER_MSVC
+        // Avoids build timeout. Must match #if condition in kMaxCols.
+#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
         if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
           Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
                   ve, vf);
@@ -678,6 +681,11 @@ HWY_INLINE void SortingNetwork(Traits st
   StoreU(vf, d, buf + 0xf * cols);
 }
 
+#else
+template <class Base>
+struct SharedTraits : public Base {};
+#endif  // VQSORT_ENABLED
+
 }  // namespace detail
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
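The VQSORT_ENABLED macro introduced in shared-inl.h is what gates this file: the vectorized code is compiled only for targets that support vqsort, and other targets get a stub so the per-target symbols still exist. A minimal sketch of the guard pattern, assuming static (non-foreach_target) compilation; SortSmallArray is a hypothetical placeholder:

```
#include <cstddef>

#include "hwy/contrib/sort/shared-inl.h"  // defines VQSORT_ENABLED per target

#if VQSORT_ENABLED || HWY_IDE
// Full vectorized implementation for targets that support vqsort.
template <class Traits, typename T>
void SortSmallArray(Traits st, T* keys, size_t num) {
  // ... SortingNetwork / BaseCase would go here ...
}
#else
// HWY_SCALAR (or MSVC opt / Arm v7 debug): provide an empty stub so the
// per-target dispatch still has a symbol to reference.
template <class Traits, typename T>
void SortSmallArray(Traits, T*, size_t) {}
#endif  // VQSORT_ENABLED
```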
diff -pruN 0.17.0-11/hwy/contrib/sort/sort_test.cc 1.0.0-2/hwy/contrib/sort/sort_test.cc
--- 0.17.0-11/hwy/contrib/sort/sort_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/sort_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,98 +13,76 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>  // memcpy
+
+#include <vector>
+
 // clang-format off
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 #include "hwy/contrib/sort/vqsort.h"
 // After foreach_target
 #include "hwy/contrib/sort/algo-inl.h"
+#include "hwy/contrib/sort/traits128-inl.h"
 #include "hwy/contrib/sort/result-inl.h"
 #include "hwy/contrib/sort/vqsort-inl.h"  // BaseCase
 #include "hwy/tests/test_util-inl.h"
 // clang-format on
 
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>  // memcpy
-
-#include <algorithm>  // std::max
-#include <vector>
-
-#undef VQSORT_TEST_IMPL
-#if (HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128) || \
-    (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD)
-// Scalar does not implement these, and MSVC non-debug builds time out.
-#define VQSORT_TEST_IMPL 0
-#else
-#define VQSORT_TEST_IMPL 1
-#endif
-
-#undef VQSORT_TEST_SORT
-// MSVC non-debug builds time out.
-#if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD
-#define VQSORT_TEST_SORT 0
-#else
-#define VQSORT_TEST_SORT 1
-#endif
-
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
 namespace {
 
-#if VQSORT_TEST_IMPL || VQSORT_TEST_SORT
-using detail::TraitsLane;
 using detail::OrderAscending;
-using detail::OrderAscending128;
 using detail::OrderDescending;
-using detail::OrderDescending128;
 using detail::SharedTraits;
+using detail::TraitsLane;
+#if VQSORT_ENABLED || HWY_IDE
+using detail::OrderAscending128;
+using detail::OrderAscendingKV128;
+using detail::OrderDescending128;
+using detail::OrderDescendingKV128;
 using detail::Traits128;
-#endif
-
-#if !VQSORT_TEST_IMPL
-static void TestAllMedian() {}
-static void TestAllBaseCase() {}
-static void TestAllPartition() {}
-static void TestAllGenerator() {}
-#else
 
 template <class Traits>
 static HWY_NOINLINE void TestMedian3() {
-  using T = uint64_t;
-  using D = CappedTag<T, 1>;
+  using LaneType = typename Traits::LaneType;
+  using D = CappedTag<LaneType, 1>;
   SharedTraits<Traits> st;
   const D d;
   using V = Vec<D>;
   for (uint32_t bits = 0; bits < 8; ++bits) {
-    const V v0 = Set(d, T{(bits & (1u << 0)) ? 1u : 0u});
-    const V v1 = Set(d, T{(bits & (1u << 1)) ? 1u : 0u});
-    const V v2 = Set(d, T{(bits & (1u << 2)) ? 1u : 0u});
-    const T m = GetLane(detail::MedianOf3(st, v0, v1, v2));
+    const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u});
+    const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u});
+    const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u});
+    const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2));
     // If at least half(rounded up) of bits are 1, so is the median.
     const size_t count = PopCount(bits);
-    HWY_ASSERT_EQ((count >= 2) ? static_cast<T>(1) : 0, m);
+    HWY_ASSERT_EQ((count >= 2) ? static_cast<LaneType>(1) : 0, m);
   }
 }
 
 HWY_NOINLINE void TestAllMedian() {
-  TestMedian3<TraitsLane<OrderAscending> >();
+  TestMedian3<TraitsLane<OrderAscending<uint64_t> > >();
 }
 
-template <class Traits, typename T>
+template <class Traits>
 static HWY_NOINLINE void TestBaseCaseAscDesc() {
+  using LaneType = typename Traits::LaneType;
   SharedTraits<Traits> st;
-  const SortTag<T> d;
+  const SortTag<LaneType> d;
   const size_t N = Lanes(d);
   const size_t base_case_num = SortConstants::BaseCaseNum(N);
   const size_t N1 = st.LanesPerKey();
 
   constexpr int kDebug = 0;
-  auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N);
-  auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
+  auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
+  auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
 
   std::vector<size_t> lengths;
   lengths.push_back(HWY_MAX(1, N1));
@@ -125,43 +103,45 @@ static HWY_NOINLINE void TestBaseCaseAsc
   for (bool asc : {false, true}) {
     for (size_t len : lengths) {
       for (size_t misalign : misalignments) {
-        T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
+        LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
         if (kDebug) {
           printf("============%s asc %d N1 %d len %d misalign %d\n",
-                 hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1),
+                 st.KeyString().c_str(), asc, static_cast<int>(N1),
                  static_cast<int>(len), static_cast<int>(misalign));
         }
 
         for (size_t i = 0; i < misalign; ++i) {
-          aligned_keys[i] = hwy::LowestValue<T>();
+          aligned_lanes[i] = hwy::LowestValue<LaneType>();
         }
-        InputStats<T> input_stats;
+        InputStats<LaneType> input_stats;
         for (size_t i = 0; i < len; ++i) {
-          keys[i] =
-              asc ? static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i));
-          input_stats.Notify(keys[i]);
-          if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
+          lanes[i] = asc ? static_cast<LaneType>(LaneType(i) + 1)
+                         : static_cast<LaneType>(LaneType(len) - LaneType(i));
+          input_stats.Notify(lanes[i]);
+          if (kDebug >= 2) {
+            printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+          }
         }
         for (size_t i = len; i < base_case_num + N; ++i) {
-          keys[i] = hwy::LowestValue<T>();
+          lanes[i] = hwy::LowestValue<LaneType>();
         }
 
-        detail::BaseCase(d, st, keys, len, buf.get());
+        detail::BaseCase(d, st, lanes, lanes + len, len, buf.get());
 
         if (kDebug >= 2) {
           printf("out>>>>>>\n");
           for (size_t i = 0; i < len; ++i) {
-            printf("%3zu: %f\n", i, double(keys[i]));
+            printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
           }
         }
 
-        HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc"));
+        HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));
         for (size_t i = 0; i < misalign; ++i) {
-          if (aligned_keys[i] != hwy::LowestValue<T>())
+          if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
             HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
         }
         for (size_t i = len; i < base_case_num + N; ++i) {
-          if (keys[i] != hwy::LowestValue<T>())
+          if (lanes[i] != hwy::LowestValue<LaneType>())
             HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
         }
       }  // misalign
@@ -169,17 +149,18 @@ static HWY_NOINLINE void TestBaseCaseAsc
   }      // asc
 }
 
-template <class Traits, typename T>
+template <class Traits>
 static HWY_NOINLINE void TestBaseCase01() {
+  using LaneType = typename Traits::LaneType;
   SharedTraits<Traits> st;
-  const SortTag<T> d;
+  const SortTag<LaneType> d;
   const size_t N = Lanes(d);
   const size_t base_case_num = SortConstants::BaseCaseNum(N);
   const size_t N1 = st.LanesPerKey();
 
   constexpr int kDebug = 0;
-  auto keys = hwy::AllocateAligned<T>(base_case_num + N);
-  auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
+  auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
+  auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
 
   std::vector<size_t> lengths;
   lengths.push_back(HWY_MAX(1, N1));
@@ -191,65 +172,69 @@ static HWY_NOINLINE void TestBaseCase01(
 
   for (size_t len : lengths) {
     if (kDebug) {
-      printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(),
+      printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(),
              static_cast<int>(N1), static_cast<int>(len));
     }
     const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
     for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
-      InputStats<T> input_stats;
+      InputStats<LaneType> input_stats;
       for (size_t i = 0; i < len; ++i) {
-        keys[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
-        input_stats.Notify(keys[i]);
-        if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
+        lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
+        input_stats.Notify(lanes[i]);
+        if (kDebug >= 2) {
+          printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+        }
       }
       for (size_t i = len; i < base_case_num + N; ++i) {
-        keys[i] = hwy::LowestValue<T>();
+        lanes[i] = hwy::LowestValue<LaneType>();
       }
 
-      detail::BaseCase(d, st, keys.get(), len, buf.get());
+      detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get());
 
       if (kDebug >= 2) {
         printf("out>>>>>>\n");
         for (size_t i = 0; i < len; ++i) {
-          printf("%3zu: %f\n", i, double(keys[i]));
+          printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
         }
       }
 
-      HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01"));
+      HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));
       for (size_t i = len; i < base_case_num + N; ++i) {
-        if (keys[i] != hwy::LowestValue<T>())
+        if (lanes[i] != hwy::LowestValue<LaneType>())
           HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
       }
     }  // bits
   }    // len
 }
 
-template <class Traits, typename T>
+template <class Traits>
 static HWY_NOINLINE void TestBaseCase() {
-  TestBaseCaseAscDesc<Traits, T>();
-  TestBaseCase01<Traits, T>();
+  TestBaseCaseAscDesc<Traits>();
+  TestBaseCase01<Traits>();
 }
 
 HWY_NOINLINE void TestAllBaseCase() {
   // Workaround for stack overflow on MSVC debug.
-#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
+#if defined(_MSC_VER)
   return;
 #endif
+  // Only enable EMU128 on x86 - it's slow on emulators.
+  if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
 
-  TestBaseCase<TraitsLane<OrderAscending>, int32_t>();
-  TestBaseCase<TraitsLane<OrderDescending>, int64_t>();
-  TestBaseCase<Traits128<OrderAscending128>, uint64_t>();
-  TestBaseCase<Traits128<OrderDescending128>, uint64_t>();
+  TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
+  TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
+  TestBaseCase<Traits128<OrderAscending128> >();
+  TestBaseCase<Traits128<OrderDescending128> >();
 }
 
-template <class Traits, typename T>
-static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
-                                         size_t left, size_t border,
-                                         size_t right, const size_t N1,
-                                         const T* pivot) {
+template <class Traits>
+static HWY_NOINLINE void VerifyPartition(
+    Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
+    size_t border, size_t right, const size_t N1,
+    const typename Traits::LaneType* pivot) {
   /* for (size_t i = left; i < right; ++i) {
      if (i == border) printf("--\n");
-     printf("%4zu: %3d\n", i, keys[i]);
+     printf("%4zu: %3d\n", i, lanes[i]);
    }*/
 
   HWY_ASSERT(left % N1 == 0);
@@ -257,30 +242,33 @@ static HWY_NOINLINE void VerifyPartition
   HWY_ASSERT(right % N1 == 0);
   const bool asc = typename Traits::Order().IsAscending();
   for (size_t i = left; i < border; i += N1) {
-    if (st.Compare1(pivot, keys + i)) {
+    if (st.Compare1(pivot, lanes + i)) {
       HWY_ABORT(
           "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
           "border %d",
-          hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
-          double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
-          double(keys[i + 0]), static_cast<int>(border));
+          st.KeyString().c_str(), asc, static_cast<int>(i),
+          static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
+          static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]),
+          static_cast<int>(border));
     }
   }
   for (size_t i = border; i < right; i += N1) {
-    if (!st.Compare1(pivot, keys + i)) {
+    if (!st.Compare1(pivot, lanes + i)) {
       HWY_ABORT(
           "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
           "border %d",
-          hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
-          double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
-          double(keys[i]), static_cast<int>(border));
+          st.KeyString().c_str(), asc, static_cast<int>(i),
+          static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
+          static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]),
+          static_cast<int>(border));
     }
   }
 }
 
-template <class Traits, typename T>
+template <class Traits>
 static HWY_NOINLINE void TestPartition() {
-  const SortTag<T> d;
+  using LaneType = typename Traits::LaneType;
+  const SortTag<LaneType> d;
   SharedTraits<Traits> st;
   const bool asc = typename Traits::Order().IsAscending();
   const size_t N = Lanes(d);
@@ -288,8 +276,8 @@ static HWY_NOINLINE void TestPartition()
   const size_t base_case_num = SortConstants::BaseCaseNum(N);
   // left + len + align
   const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
-  auto aligned_keys = hwy::AllocateAligned<T>(total);
-  auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N));
+  auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
+  auto buf = hwy::AllocateAligned<LaneType>(SortConstants::PartitionBufNum(N));
 
   const size_t N1 = st.LanesPerKey();
   for (bool in_asc : {false, true}) {
@@ -298,61 +286,66 @@ static HWY_NOINLINE void TestPartition()
       for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
                          2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
         const size_t len = (base_case_num + ofs) & ~(N1 - 1);
-        for (T pivot1 :
-             {T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) {
-          const T pivot2[2] = {pivot1, 0};
+        for (LaneType pivot1 :
+             {LaneType(0), LaneType(len / 3), LaneType(len / 2),
+              LaneType(2 * len / 3), LaneType(len)}) {
+          const LaneType pivot2[2] = {pivot1, 0};
           const auto pivot = st.SetKey(d, pivot2);
           for (size_t misalign = 0; misalign < N;
                misalign += st.LanesPerKey()) {
-            T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
+            LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
             const size_t right = left + len;
             if (kDebug) {
               printf(
                   "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
-                  hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left),
+                  st.KeyString().c_str(), asc, static_cast<int>(left),
                   static_cast<int>(len), static_cast<int>(right),
-                  double(pivot2[1]), double(pivot2[0]));
+                  static_cast<double>(pivot2[1]),
+                  static_cast<double>(pivot2[0]));
             }
 
             for (size_t i = 0; i < misalign; ++i) {
-              aligned_keys[i] = hwy::LowestValue<T>();
+              aligned_lanes[i] = hwy::LowestValue<LaneType>();
             }
             for (size_t i = 0; i < left; ++i) {
-              keys[i] = hwy::LowestValue<T>();
+              lanes[i] = hwy::LowestValue<LaneType>();
             }
             for (size_t i = left; i < right; ++i) {
-              keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left)
-                                              : static_cast<T>(right) - T(i));
-              if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
+              lanes[i] = static_cast<LaneType>(
+                  in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
+                         : static_cast<LaneType>(right) - LaneType(i));
+              if (kDebug >= 2) {
+                printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+              }
             }
             for (size_t i = right; i < total - misalign; ++i) {
-              keys[i] = hwy::LowestValue<T>();
+              lanes[i] = hwy::LowestValue<LaneType>();
             }
 
             size_t border =
-                detail::Partition(d, st, keys, left, right, pivot, buf.get());
+                detail::Partition(d, st, lanes, left, right, pivot, buf.get());
 
             if (kDebug >= 2) {
               printf("out>>>>>>\n");
               for (size_t i = left; i < right; ++i) {
-                printf("%3zu: %f\n", i, double(keys[i]));
+                printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
               }
               for (size_t i = right; i < total - misalign; ++i) {
-                printf("%3zu: sentinel %f\n", i, double(keys[i]));
+                printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
               }
             }
 
-            VerifyPartition(st, keys, left, border, right, N1, pivot2);
+            VerifyPartition(st, lanes, left, border, right, N1, pivot2);
             for (size_t i = 0; i < misalign; ++i) {
-              if (aligned_keys[i] != hwy::LowestValue<T>())
+              if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
                 HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
             }
             for (size_t i = 0; i < left; ++i) {
-              if (keys[i] != hwy::LowestValue<T>())
+              if (lanes[i] != hwy::LowestValue<LaneType>())
                 HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
             }
             for (size_t i = right; i < total - misalign; ++i) {
-              if (keys[i] != hwy::LowestValue<T>())
+              if (lanes[i] != hwy::LowestValue<LaneType>())
                 HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
             }
           }  // misalign
@@ -363,15 +356,18 @@ static HWY_NOINLINE void TestPartition()
 }
 
 HWY_NOINLINE void TestAllPartition() {
-  TestPartition<TraitsLane<OrderAscending>, int16_t>();
-  TestPartition<TraitsLane<OrderDescending>, int32_t>();
-  TestPartition<TraitsLane<OrderAscending>, int64_t>();
-  TestPartition<TraitsLane<OrderDescending>, float>();
+  // Only enable EMU128 on x86 - it's slow on emulators.
+  if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
+
+  TestPartition<TraitsLane<OrderAscending<int16_t> > >();
+  TestPartition<TraitsLane<OrderDescending<int32_t> > >();
+  TestPartition<TraitsLane<OrderAscending<int64_t> > >();
+  TestPartition<TraitsLane<OrderDescending<float> > >();
 #if HWY_HAVE_FLOAT64
-  TestPartition<TraitsLane<OrderDescending>, double>();
+  TestPartition<TraitsLane<OrderDescending<double> > >();
 #endif
-  TestPartition<Traits128<OrderAscending128>, uint64_t>();
-  TestPartition<Traits128<OrderDescending128>, uint64_t>();
+  TestPartition<Traits128<OrderAscending128> >();
+  TestPartition<Traits128<OrderDescending128> >();
 }
 
 // (used for sample selection for choosing a pivot)
@@ -401,7 +397,7 @@ static HWY_NOINLINE void TestRandomGener
 
     // Also ensure the mean is near the middle of the range
     const double expected = (num_blocks - 1) / 2.0;
-    const double actual = double(sum) / kReps;
+    const double actual = static_cast<double>(sum) / kReps;
     HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
   }
 }
@@ -411,22 +407,26 @@ HWY_NOINLINE void TestAllGenerator() {
   TestRandomGenerator<uint64_t>();
 }
 
-#endif  // VQSORT_TEST_IMPL
-
-#if !VQSORT_TEST_SORT
-static void TestAllSort() {}
 #else
+static void TestAllMedian() {}
+static void TestAllBaseCase() {}
+static void TestAllPartition() {}
+static void TestAllGenerator() {}
+#endif  // VQSORT_ENABLED
 
 // Remembers input, and compares results to that of a reference algorithm.
-template <class Traits, typename T>
+template <class Traits>
 class CompareResults {
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
+
  public:
-  void SetInput(const T* in, size_t num) {
-    copy_.resize(num);
-    memcpy(copy_.data(), in, num * sizeof(T));
+  CompareResults(const LaneType* in, size_t num_lanes) {
+    copy_.resize(num_lanes);
+    memcpy(copy_.data(), in, num_lanes * sizeof(LaneType));
   }
 
-  bool Verify(const T* output) {
+  bool Verify(const LaneType* output) {
 #if HAVE_PDQSORT
     const Algo reference = Algo::kPDQ;
 #else
@@ -434,13 +434,28 @@ class CompareResults {
 #endif
     SharedState shared;
     using Order = typename Traits::Order;
-    Run<Order>(reference, copy_.data(), copy_.size(), shared,
-               /*thread=*/0);
+    const Traits st;
+    const size_t num_keys = copy_.size() / st.LanesPerKey();
+    Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
+               shared, /*thread=*/0);
 
     for (size_t i = 0; i < copy_.size(); ++i) {
       if (copy_[i] != output[i]) {
-        fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(),
-                static_cast<int>(i), double(copy_[i]), double(output[i]));
+        if (sizeof(KeyType) == 16) {
+          fprintf(stderr,
+                  "%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
+                  st.KeyString().c_str(), Order().IsAscending(),
+                  static_cast<int>(i), static_cast<int>(copy_.size()),
+                  static_cast<uint64_t>(copy_[i]),
+                  static_cast<uint64_t>(output[i]));
+        } else {
+          fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ",
+                  st.KeyString().c_str(), Order().IsAscending(),
+                  static_cast<int>(i), static_cast<int>(copy_.size()));
+          PrintValue(copy_[i]);
+          PrintValue(output[i]);
+          fprintf(stderr, "\n");
+        }
         return false;
       }
     }
@@ -448,7 +463,7 @@ class CompareResults {
   }
 
  private:
-  std::vector<T> copy_;
+  std::vector<LaneType> copy_;
 };
 
 std::vector<Algo> AlgoForTest() {
@@ -469,65 +484,65 @@ std::vector<Algo> AlgoForTest() {
   };
 }
 
-template <class Traits, typename T>
-void TestSort(size_t num) {
-  // TODO(janwas): fix
-  if (HWY_TARGET == HWY_SSSE3) return;
+template <class Traits>
+void TestSort(size_t num_lanes) {
 // Workaround for stack overflow on clang-cl (/F 8388608 does not help).
-#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
+#if defined(_MSC_VER)
   return;
 #endif
+  // Only enable EMU128 on x86 - it's slow on emulators.
+  if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
 
+  using Order = typename Traits::Order;
+  using LaneType = typename Traits::LaneType;
+  using KeyType = typename Traits::KeyType;
   SharedState shared;
   SharedTraits<Traits> st;
 
   // Round up to a whole number of keys.
-  num += (st.Is128() && (num & 1));
+  num_lanes += (st.Is128() && (num_lanes & 1));
+  const size_t num_keys = num_lanes / st.LanesPerKey();
 
   constexpr size_t kMaxMisalign = 16;
-  auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign);
+  auto aligned =
+      hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
   for (Algo algo : AlgoForTest()) {
-#if HAVE_IPS4O
-    if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) {
-      continue;
-    }
-#endif
     for (Dist dist : AllDist()) {
       for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
                               size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
-        T* keys = aligned.get() + misalign;
+        LaneType* lanes = aligned.get() + misalign;
 
         // Set up red zones before/after the keys to sort
         for (size_t i = 0; i < misalign; ++i) {
-          aligned[i] = hwy::LowestValue<T>();
+          aligned[i] = hwy::LowestValue<LaneType>();
         }
         for (size_t i = 0; i < kMaxMisalign; ++i) {
-          keys[num + i] = hwy::HighestValue<T>();
+          lanes[num_lanes + i] = hwy::HighestValue<LaneType>();
         }
 #if HWY_IS_MSAN
-        __msan_poison(aligned.get(), misalign * sizeof(T));
-        __msan_poison(keys + num, kMaxMisalign * sizeof(T));
+        __msan_poison(aligned.get(), misalign * sizeof(LaneType));
+        __msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
 #endif
-        InputStats<T> input_stats = GenerateInput(dist, keys, num);
-
-        CompareResults<Traits, T> compare;
-        compare.SetInput(keys, num);
+        InputStats<LaneType> input_stats =
+            GenerateInput(dist, lanes, num_lanes);
 
-        Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
-        HWY_ASSERT(compare.Verify(keys));
-        HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort"));
+        CompareResults<Traits> compare(lanes, num_lanes);
+        Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
+                   /*thread=*/0);
+        HWY_ASSERT(compare.Verify(lanes));
+        HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));
 
         // Check red zones
 #if HWY_IS_MSAN
-        __msan_unpoison(aligned.get(), misalign * sizeof(T));
-        __msan_unpoison(keys + num, kMaxMisalign * sizeof(T));
+        __msan_unpoison(aligned.get(), misalign * sizeof(LaneType));
+        __msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
 #endif
         for (size_t i = 0; i < misalign; ++i) {
-          if (aligned[i] != hwy::LowestValue<T>())
+          if (aligned[i] != hwy::LowestValue<LaneType>())
             HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
         }
-        for (size_t i = num; i < num + kMaxMisalign; ++i) {
-          if (keys[i] != hwy::HighestValue<T>())
+        for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
+          if (lanes[i] != hwy::HighestValue<LaneType>())
             HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
         }
       }  // misalign
@@ -536,32 +551,37 @@ void TestSort(size_t num) {
 }
 
 void TestAllSort() {
-  const size_t num = AdjustedReps(20 * 1000);
-
-  TestSort<TraitsLane<OrderAscending>, int16_t>(num);
-  TestSort<TraitsLane<OrderDescending>, uint16_t>(num);
-
-  TestSort<TraitsLane<OrderDescending>, int32_t>(num);
-  TestSort<TraitsLane<OrderDescending>, uint32_t>(num);
-
-  TestSort<TraitsLane<OrderAscending>, int64_t>(num);
-  TestSort<TraitsLane<OrderAscending>, uint64_t>(num);
-
-  // WARNING: for float types, SIMD comparisons will flush denormals to zero,
-  // causing mismatches with scalar sorts. In this test, we avoid generating
-  // denormal inputs.
-  TestSort<TraitsLane<OrderAscending>, float>(num);
+  for (int num : {129, 504, 20 * 1000, 34567}) {
+    const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
+    TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
+    TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
+
+    TestSort<TraitsLane<OrderDescending<int32_t> > >(num_lanes);
+    TestSort<TraitsLane<OrderDescending<uint32_t> > >(num_lanes);
+
+    TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
+    TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);
+
+    // WARNING: for float types, SIMD comparisons will flush denormals to
+    // zero, causing mismatches with scalar sorts. In this test, we avoid
+    // generating denormal inputs.
+    TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
 #if HWY_HAVE_FLOAT64  // protects algo-inl's GenerateRandom
-  if (Sorter::HaveFloat64()) {
-    TestSort<TraitsLane<OrderDescending>, double>(num);
-  }
+    if (Sorter::HaveFloat64()) {
+      TestSort<TraitsLane<OrderDescending<double> > >(num_lanes);
+    }
 #endif
 
-  TestSort<Traits128<OrderAscending128>, uint64_t>(num);
-  TestSort<Traits128<OrderAscending128>, uint64_t>(num);
-}
+// Our HeapSort does not support 128-bit keys.
+#if VQSORT_ENABLED
+    TestSort<Traits128<OrderAscending128> >(num_lanes);
+    TestSort<Traits128<OrderDescending128> >(num_lanes);
 
-#endif  // VQSORT_TEST_SORT
+    TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
+    TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
+#endif
+  }
+}
 
 }  // namespace
 // NOLINTNEXTLINE(google-readability-namespace-comments)
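The red zones in TestSort are the out-of-bounds check: sentinel values bracket the keys being sorted and must survive the sort unchanged. A self-contained sketch of the same idea with std::sort standing in for vqsort (sizes and names are illustrative):

```
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <vector>

int main() {
  constexpr size_t kPad = 16;   // red zone on each side
  constexpr size_t kNum = 100;  // keys to sort
  std::vector<int> buf(kPad + kNum + kPad);
  std::fill(buf.begin(), buf.begin() + kPad, std::numeric_limits<int>::min());
  for (size_t i = 0; i < kNum; ++i) buf[kPad + i] = std::rand();
  std::fill(buf.end() - kPad, buf.end(), std::numeric_limits<int>::max());

  std::sort(buf.begin() + kPad, buf.begin() + kPad + kNum);  // vqsort stand-in

  // The sentinels on both sides must be untouched, else the sort overran.
  for (size_t i = 0; i < kPad; ++i) {
    if (buf[i] != std::numeric_limits<int>::min() ||
        buf[kPad + kNum + i] != std::numeric_limits<int>::max()) {
      fprintf(stderr, "Overrun at %zu\n", i);
      return 1;
    }
  }
  return 0;
}
```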
diff -pruN 0.17.0-11/hwy/contrib/sort/traits128-inl.h 1.0.0-2/hwy/contrib/sort/traits128-inl.h
--- 0.17.0-11/hwy/contrib/sort/traits128-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/traits128-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -22,6 +22,9 @@
 #define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
 #endif
 
+#include <string>
+
+#include "hwy/contrib/sort/shared-inl.h"
 #include "hwy/contrib/sort/vqsort.h"  // SortDescending
 #include "hwy/highway.h"
 
@@ -30,48 +33,31 @@ namespace hwy {
 namespace HWY_NAMESPACE {
 namespace detail {
 
-#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
-
-struct OrderAscending128 {
-  using Order = SortAscending;
-
-  template <typename T>
-  HWY_INLINE bool Compare1(const T* a, const T* b) {
-    return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
-  }
-};
-
-struct OrderDescending128 {
-  using Order = SortDescending;
-
-  template <typename T>
-  HWY_INLINE bool Compare1(const T* a, const T* b) {
-    return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
-  }
-};
-
-template <class Order>
-struct Traits128 : public Order {
-  constexpr bool Is128() const { return true; }
-  constexpr size_t LanesPerKey() const { return 2; }
-};
-
-#else
+#if VQSORT_ENABLED || HWY_IDE
 
 // Highway does not provide a lane type for 128-bit keys, so we use uint64_t
 // along with an abstraction layer for single-lane vs. lane-pair, which is
 // independent of the order.
-struct Key128 {
+struct KeyAny128 {
+  constexpr bool Is128() const { return true; }
   constexpr size_t LanesPerKey() const { return 2; }
 
-  template <typename T>
-  HWY_INLINE void Swap(T* a, T* b) const {
-    const FixedTag<T, 2> d;
+  // What type bench_sort should allocate for generating inputs.
+  using LaneType = uint64_t;
+  // KeyType and KeyString are defined by derived classes.
+
+  HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
+    const FixedTag<LaneType, 2> d;
     const auto temp = LoadU(d, a);
     StoreU(LoadU(d, b), d, a);
     StoreU(temp, d, b);
   }
 
+  template <class V, class M>
+  HWY_INLINE V CompressKeys(V keys, M mask) const {
+    return CompressBlocksNot(keys, mask);
+  }
+
   template <class D>
   HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
     return LoadDup128(d, key);
@@ -136,6 +122,14 @@ struct Key128 {
   }
 };
 
+// Base class shared between OrderAscending128, OrderDescending128.
+struct Key128 : public KeyAny128 {
+  // What type to pass to Sorter::operator().
+  using KeyType = hwy::uint128_t;
+
+  std::string KeyString() const { return "U128"; }
+};
+
 // Anything order-related depends on the key traits *and* the order (see
 // FirstOfLanes). We cannot implement just one Compare function because Lt128
 // only compiles if the lane type is u64. Thus we need either overloaded
@@ -146,8 +140,7 @@ struct Key128 {
 struct OrderAscending128 : public Key128 {
   using Order = SortAscending;
 
-  template <typename T>
-  HWY_INLINE bool Compare1(const T* a, const T* b) {
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
     return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
   }
 
@@ -172,30 +165,6 @@ struct OrderAscending128 : public Key128
     return Max128(d, a, b);
   }
 
-  template <class D>
-  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
-                                 TFromD<D>* HWY_RESTRICT buf) const {
-    const size_t N = Lanes(d);
-    Store(v, d, buf);
-    v = SetKey(d, buf + 0);  // result must be broadcasted
-    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
-      v = First(d, v, SetKey(d, buf + i));
-    }
-    return v;
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
-                                TFromD<D>* HWY_RESTRICT buf) const {
-    const size_t N = Lanes(d);
-    Store(v, d, buf);
-    v = SetKey(d, buf + 0);  // result must be broadcasted
-    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
-      v = Last(d, v, SetKey(d, buf + i));
-    }
-    return v;
-  }
-
   // Same as for regular lanes because 128-bit lanes are u64.
   template <class D>
   HWY_INLINE Vec<D> FirstValue(D d) const {
@@ -211,8 +180,7 @@ struct OrderAscending128 : public Key128
 struct OrderDescending128 : public Key128 {
   using Order = SortDescending;
 
-  template <typename T>
-  HWY_INLINE bool Compare1(const T* a, const T* b) {
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
     return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
   }
 
@@ -237,28 +205,92 @@ struct OrderDescending128 : public Key12
     return Min128(d, a, b);
   }
 
+  // Same as for regular lanes because 128-bit lanes are u64.
   template <class D>
-  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
-                                 TFromD<D>* HWY_RESTRICT buf) const {
-    const size_t N = Lanes(d);
-    Store(v, d, buf);
-    v = SetKey(d, buf + 0);  // result must be broadcasted
-    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
-      v = First(d, v, SetKey(d, buf + i));
-    }
-    return v;
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::HighestValue<TFromD<D> >());
   }
 
   template <class D>
-  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
-                                TFromD<D>* HWY_RESTRICT buf) const {
-    const size_t N = Lanes(d);
-    Store(v, d, buf);
-    v = SetKey(d, buf + 0);  // result must be broadcasted
-    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
-      v = Last(d, v, SetKey(d, buf + i));
-    }
-    return v;
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::LowestValue<TFromD<D> >());
+  }
+};
+
+// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
+struct KeyValue128 : public KeyAny128 {
+  // What type to pass to Sorter::operator().
+  using KeyType = K64V64;
+
+  std::string KeyString() const { return "KV128"; }
+};
+
+struct OrderAscendingKV128 : public KeyValue128 {
+  using Order = SortAscending;
+
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+    return a[1] < b[1];
+  }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+    return Lt128Upper(d, a, b);
+  }
+
+  // Used by CompareTop
+  template <class V>
+  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+    return Lt(a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+    return Min128Upper(d, a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+    return Max128Upper(d, a, b);
+  }
+
+  // Same as for regular lanes because 128-bit lanes are u64.
+  template <class D>
+  HWY_INLINE Vec<D> FirstValue(D d) const {
+    return Set(d, hwy::LowestValue<TFromD<D> >());
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastValue(D d) const {
+    return Set(d, hwy::HighestValue<TFromD<D> >());
+  }
+};
+
+struct OrderDescendingKV128 : public KeyValue128 {
+  using Order = SortDescending;
+
+  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+    return b[1] < a[1];
+  }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+    return Lt128Upper(d, b, a);
+  }
+
+  // Used by CompareTop
+  template <class V>
+  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+    return Lt(b, a);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+    return Max128Upper(d, a, b);
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+    return Min128Upper(d, a, b);
   }
 
   // Same as for regular lanes because 128-bit lanes are u64.
@@ -276,16 +308,21 @@ struct OrderDescending128 : public Key12
 // Shared code that depends on Order.
 template <class Base>
 class Traits128 : public Base {
-#if HWY_TARGET <= HWY_AVX2
+  // Special case for >= 256 bit vectors
+#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
   // Returns vector with only the top u64 lane valid. Useful when the next step
   // is to replicate the mask anyway.
   template <class D>
   HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
     const Base* base = static_cast<const Base*>(this);
-    const Vec<D> eqHL = VecFromMask(d, Eq(a, b));
+    const Mask<D> eqHL = Eq(a, b);
     const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
+#if HWY_TARGET == HWY_SVE_256
+    return IfThenElse(eqHL, DupEven(ltHL), ltHL);
+#else
     const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
-    return OrAnd(ltHL, eqHL, ltLX);
+    return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX);
+#endif
   }
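For reference, a scalar sketch of the predicate that CompareTop assembles from u64 lanes (ascending case; the function name and signature below are illustrative only): the upper lane decides and the lower lane breaks ties, which is what both OrAnd(ltHL, eqHL, ltLX) and the SVE IfThenElse variant compute into the top lane.

inline bool Less128(uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi) {
  // lt_hi | (eq_hi & lt_lo), i.e. the standard 128-bit less-than.
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}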
 
   // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
@@ -293,16 +330,42 @@ class Traits128 : public Base {
   // replicate it 4x. Only called for >= 256-bit vectors.
   template <class V>
   HWY_INLINE V ReplicateTop4x(V v) const {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET == HWY_SVE_256
+    return svdup_lane_u64(v, 3);
+#elif HWY_TARGET <= HWY_AVX3
     return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
 #else  // AVX2
     return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
 #endif
   }
-#endif
+#endif  // HWY_TARGET
 
  public:
-  constexpr bool Is128() const { return true; }
+  template <class D>
+  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+                                 TFromD<D>* HWY_RESTRICT buf) const {
+    const Base* base = static_cast<const Base*>(this);
+    const size_t N = Lanes(d);
+    Store(v, d, buf);
+    v = base->SetKey(d, buf + 0);  // result must be broadcasted
+    for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
+      v = base->First(d, v, base->SetKey(d, buf + i));
+    }
+    return v;
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+                                TFromD<D>* HWY_RESTRICT buf) const {
+    const Base* base = static_cast<const Base*>(this);
+    const size_t N = Lanes(d);
+    Store(v, d, buf);
+    v = base->SetKey(d, buf + 0);  // result must be broadcasted
+    for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
+      v = base->Last(d, v, base->SetKey(d, buf + i));
+    }
+    return v;
+  }
 
   template <class D>
   HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
@@ -320,7 +383,7 @@ class Traits128 : public Base {
     const Base* base = static_cast<const Base*>(this);
     Vec<D> swapped = base->ReverseKeys2(d, v);
 
-#if HWY_TARGET <= HWY_AVX2
+#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
     const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
     return IfVecThenElse(select, swapped, v);
 #else
@@ -358,7 +421,7 @@ class Traits128 : public Base {
   }
 };
 
-#endif  // HWY_TARGET
+#endif  // VQSORT_ENABLED
 
 }  // namespace detail
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff -pruN 0.17.0-11/hwy/contrib/sort/traits-inl.h 1.0.0-2/hwy/contrib/sort/traits-inl.h
--- 0.17.0-11/hwy/contrib/sort/traits-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/traits-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -22,33 +22,54 @@
 #define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
 #endif
 
-#include "hwy/contrib/sort/disabled_targets.h"
+#include <string>
+
 #include "hwy/contrib/sort/shared-inl.h"  // SortConstants
 #include "hwy/contrib/sort/vqsort.h"      // SortDescending
 #include "hwy/highway.h"
+#include "hwy/print.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
 namespace detail {
 
+#if VQSORT_ENABLED || HWY_IDE
+
 // Highway does not provide a lane type for 128-bit keys, so we use uint64_t
 // along with an abstraction layer for single-lane vs. lane-pair, which is
 // independent of the order.
+template <typename T>
 struct KeyLane {
+  constexpr bool Is128() const { return false; }
   constexpr size_t LanesPerKey() const { return 1; }
 
+  // What type bench_sort should allocate for generating inputs.
+  using LaneType = T;
+  // What type to pass to Sorter::operator().
+  using KeyType = T;
+
+  std::string KeyString() const {
+    char string100[100];
+    hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
+    return string100;
+  }
+
   // For HeapSort
-  template <typename T>
   HWY_INLINE void Swap(T* a, T* b) const {
     const T temp = *a;
     *a = *b;
     *b = temp;
   }
 
+  template <class V, class M>
+  HWY_INLINE V CompressKeys(V keys, M mask) const {
+    return CompressNot(keys, mask);
+  }
+
   // Broadcasts one key into a vector
   template <class D>
-  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+  HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
     return Set(d, *key);
   }
 
@@ -149,10 +170,10 @@ struct KeyLane {
 // We avoid overloaded functions because we want all functions to be callable
 // from a SortTraits without per-function wrappers. Specializing would work, but
 // we are anyway going to specialize at a higher level.
-struct OrderAscending : public KeyLane {
+template <typename T>
+struct OrderAscending : public KeyLane<T> {
   using Order = SortAscending;
 
-  template <typename T>
   HWY_INLINE bool Compare1(const T* a, const T* b) {
     return *a < *b;
   }
@@ -175,31 +196,31 @@ struct OrderAscending : public KeyLane {
 
   template <class D>
   HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
-                                 TFromD<D>* HWY_RESTRICT /* buf */) const {
+                                 T* HWY_RESTRICT /* buf */) const {
     return MinOfLanes(d, v);
   }
 
   template <class D>
   HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
-                                TFromD<D>* HWY_RESTRICT /* buf */) const {
+                                T* HWY_RESTRICT /* buf */) const {
     return MaxOfLanes(d, v);
   }
 
   template <class D>
   HWY_INLINE Vec<D> FirstValue(D d) const {
-    return Set(d, hwy::LowestValue<TFromD<D>>());
+    return Set(d, hwy::LowestValue<T>());
   }
 
   template <class D>
   HWY_INLINE Vec<D> LastValue(D d) const {
-    return Set(d, hwy::HighestValue<TFromD<D>>());
+    return Set(d, hwy::HighestValue<T>());
   }
 };
 
-struct OrderDescending : public KeyLane {
+template <typename T>
+struct OrderDescending : public KeyLane<T> {
   using Order = SortDescending;
 
-  template <typename T>
   HWY_INLINE bool Compare1(const T* a, const T* b) {
     return *b < *a;
   }
@@ -221,32 +242,30 @@ struct OrderDescending : public KeyLane
 
   template <class D>
   HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
-                                 TFromD<D>* HWY_RESTRICT /* buf */) const {
+                                 T* HWY_RESTRICT /* buf */) const {
     return MaxOfLanes(d, v);
   }
 
   template <class D>
   HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
-                                TFromD<D>* HWY_RESTRICT /* buf */) const {
+                                T* HWY_RESTRICT /* buf */) const {
     return MinOfLanes(d, v);
   }
 
   template <class D>
   HWY_INLINE Vec<D> FirstValue(D d) const {
-    return Set(d, hwy::HighestValue<TFromD<D>>());
+    return Set(d, hwy::HighestValue<T>());
   }
 
   template <class D>
   HWY_INLINE Vec<D> LastValue(D d) const {
-    return Set(d, hwy::LowestValue<TFromD<D>>());
+    return Set(d, hwy::LowestValue<T>());
   }
 };
 
 // Shared code that depends on Order.
 template <class Base>
 struct TraitsLane : public Base {
-  constexpr bool Is128() const { return false; }
-
   // For each lane i: replaces a[i] with the first and b[i] with the second
   // according to Base.
   // Corresponds to a conditional swap, which is one "node" of a sorting
@@ -316,6 +335,66 @@ struct TraitsLane : public Base {
   }
 };
 
+#else
+
+// Base class shared between OrderAscending, OrderDescending.
+template <typename T>
+struct KeyLane {
+  constexpr bool Is128() const { return false; }
+  constexpr size_t LanesPerKey() const { return 1; }
+
+  using LaneType = T;
+  using KeyType = T;
+
+  std::string KeyString() const {
+    char string100[100];
+    hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
+    return string100;
+  }
+};
+
+template <typename T>
+struct OrderAscending : public KeyLane<T> {
+  using Order = SortAscending;
+
+  HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
+    return Lt(a, b);
+  }
+};
+
+template <typename T>
+struct OrderDescending : public KeyLane<T> {
+  using Order = SortDescending;
+
+  HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
+
+  template <class D>
+  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
+    return Lt(b, a);
+  }
+};
+
+template <class Order>
+struct TraitsLane : public Order {
+  // For HeapSort
+  template <typename T>  // MSVC doesn't find typename Order::LaneType.
+  HWY_INLINE void Swap(T* a, T* b) const {
+    const T temp = *a;
+    *a = *b;
+    *b = temp;
+  }
+
+  template <class D>
+  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+    return Set(d, *key);
+  }
+};
+
+#endif  // VQSORT_ENABLED
+
 }  // namespace detail
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_128a.cc 1.0.0-2/hwy/contrib/sort/vqsort_128a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_128a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_128a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits128-inl.h"
@@ -30,9 +29,16 @@ namespace HWY_NAMESPACE {
 
 void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
   SortTag<uint64_t> d;
   detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
   Sort(d, st, keys, num, buf);
+#else
+  (void) keys;
+  (void) num;
+  (void) buf;
+  HWY_ASSERT(0);
+#endif
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_128d.cc 1.0.0-2/hwy/contrib/sort/vqsort_128d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_128d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_128d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits128-inl.h"
@@ -30,9 +29,16 @@ namespace HWY_NAMESPACE {
 
 void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                  uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
   SortTag<uint64_t> d;
   detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
   Sort(d, st, keys, num, buf);
+#else
+  (void) keys;
+  (void) num;
+  (void) buf;
+  HWY_ASSERT(0);
+#endif
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort.cc 1.0.0-2/hwy/contrib/sort/vqsort.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort.cc	2022-07-27 11:48:16.000000000 +0000
@@ -17,11 +17,9 @@
 
 #include <string.h>  // memset
 
-#include "hwy/aligned_allocator.h"
-
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/shared-inl.h"
@@ -36,13 +34,17 @@
 #endif
 #endif  // VQSORT_STACK
 
+#if !VQSORT_STACK
+#include "hwy/aligned_allocator.h"
+#endif
+
 // Check if we have sys/random.h. First skip some systems on which the check
 // itself (features.h) might be problematic.
 #if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
 #define VQSORT_GETRANDOM 0
 #endif
 
-#if !defined(VQSORT_GETRANDOM) && (defined(linux) || defined(__linux__))
+#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
 #include <features.h>
 
 // ---- which libc
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_f32a.cc 1.0.0-2/hwy/contrib/sort/vqsort_f32a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_f32a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_f32a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -30,7 +29,7 @@ namespace HWY_NAMESPACE {
 
 void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
   SortTag<float> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_f32d.cc 1.0.0-2/hwy/contrib/sort/vqsort_f32d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_f32d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_f32d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
                  float* HWY_RESTRICT buf) {
   SortTag<float> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_f64a.cc 1.0.0-2/hwy/contrib/sort/vqsort_f64a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_f64a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_f64a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -32,7 +31,7 @@ void SortF64Asc(double* HWY_RESTRICT key
                 double* HWY_RESTRICT buf) {
 #if HWY_HAVE_FLOAT64
   SortTag<double> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<double>>> st;
   Sort(d, st, keys, num, buf);
 #else
   (void)keys;
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_f64d.cc 1.0.0-2/hwy/contrib/sort/vqsort_f64d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_f64d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_f64d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -32,7 +31,7 @@ void SortF64Desc(double* HWY_RESTRICT ke
                  double* HWY_RESTRICT buf) {
 #if HWY_HAVE_FLOAT64
   SortTag<double> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<double>>> st;
   Sort(d, st, keys, num, buf);
 #else
   (void)keys;
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort.h 1.0.0-2/hwy/contrib/sort/vqsort.h
--- 0.17.0-11/hwy/contrib/sort/vqsort.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort.h	2022-07-27 11:48:16.000000000 +0000
@@ -13,8 +13,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Interface to vectorized quicksort with dynamic dispatch. Measurements and
-// detailed description: https://arxiv.org/abs/2205.05982 .
+// Interface to vectorized quicksort with dynamic dispatch.
+// Blog post: https://tinyurl.com/vqsort-blog
+// Paper with measurements: https://arxiv.org/abs/2205.05982
+//
+// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
+// worthwhile, we recommend using this code for sorting arrays whose size is at
+// least 512 KiB.
 
 #ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
 #define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
@@ -23,15 +28,6 @@
 
 namespace hwy {
 
-// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
-// https://reviews.llvm.org/D86310
-#pragma pack(push, 1)
-struct alignas(16) uint128_t {
-  uint64_t lo;  // little-endian layout
-  uint64_t hi;
-};
-#pragma pack(pop)
-
 // Tag arguments that determine the sort order.
 struct SortAscending {
   constexpr bool IsAscending() const { return true; }
@@ -86,6 +82,9 @@ class HWY_CONTRIB_DLLEXPORT Sorter {
   void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
   void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
 
+  void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
+  void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
   // For internal use only
   static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
   static bool HaveFloat64();
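A minimal usage sketch of this interface; the container and fill values are illustrative only, and the uint64_t overload used here is the one implemented by vqsort_u64a.cc further below.

#include <cstdint>
#include <vector>
#include "hwy/contrib/sort/vqsort.h"

void SorterExample() {
  std::vector<uint64_t> keys(size_t{1} << 16);
  for (size_t i = 0; i < keys.size(); ++i) keys[i] = keys.size() - i;
  hwy::Sorter sorter;  // reusable across calls
  sorter(keys.data(), keys.size(), hwy::SortAscending());
}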
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_i16a.cc 1.0.0-2/hwy/contrib/sort/vqsort_i16a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_i16a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_i16a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,20 +13,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
 #include "hwy/contrib/sort/vqsort-inl.h"
 
-// Workaround for build timeout
-#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
@@ -34,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
                 int16_t* HWY_RESTRICT buf) {
   SortTag<int16_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
@@ -56,5 +52,3 @@ void Sorter::operator()(int16_t* HWY_RES
 
 }  // namespace hwy
 #endif  // HWY_ONCE
-
-#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_i16d.cc 1.0.0-2/hwy/contrib/sort/vqsort_i16d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_i16d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_i16d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,20 +13,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
 #include "hwy/contrib/sort/vqsort-inl.h"
 
-// Workaround for build timeout
-#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
@@ -34,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
                  int16_t* HWY_RESTRICT buf) {
   SortTag<int16_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
@@ -56,5 +52,3 @@ void Sorter::operator()(int16_t* HWY_RES
 
 }  // namespace hwy
 #endif  // HWY_ONCE
-
-#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_i32a.cc 1.0.0-2/hwy/contrib/sort/vqsort_i32a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_i32a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_i32a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
                 int32_t* HWY_RESTRICT buf) {
   SortTag<int32_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_i32d.cc 1.0.0-2/hwy/contrib/sort/vqsort_i32d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_i32d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_i32d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
                  int32_t* HWY_RESTRICT buf) {
   SortTag<int32_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_i64a.cc 1.0.0-2/hwy/contrib/sort/vqsort_i64a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_i64a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_i64a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
                 int64_t* HWY_RESTRICT buf) {
   SortTag<int64_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_i64d.cc 1.0.0-2/hwy/contrib/sort/vqsort_i64d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_i64d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_i64d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
                  int64_t* HWY_RESTRICT buf) {
   SortTag<int64_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort-inl.h 1.0.0-2/hwy/contrib/sort/vqsort-inl.h
--- 0.17.0-11/hwy/contrib/sort/vqsort-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -29,8 +29,7 @@
 
 #include <string.h>  // memcpy
 
-#include "hwy/cache_control.h"  // Prefetch
-#include "hwy/contrib/sort/disabled_targets.h"
+#include "hwy/cache_control.h"        // Prefetch
 #include "hwy/contrib/sort/vqsort.h"  // Fill24Bytes
 
 #if HWY_IS_MSAN
@@ -57,117 +56,66 @@ namespace hwy {
 namespace HWY_NAMESPACE {
 namespace detail {
 
-#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
+using Constants = hwy::SortConstants;
 
-template <typename T>
-void Swap(T* a, T* b) {
-  T t = *a;
-  *a = *b;
-  *b = t;
-}
+// ------------------------------ HeapSort
 
-// Scalar version of HeapSort (see below)
 template <class Traits, typename T>
-void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
-  if (num < 2) return;
+void SiftDown(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes,
+              size_t start) {
+  constexpr size_t N1 = st.LanesPerKey();
+  const FixedTag<T, N1> d;
 
-  // Build heap.
-  for (size_t i = 1; i < num; i += 1) {
-    size_t j = i;
-    while (j != 0) {
-      const size_t idx_parent = ((j - 1) / 1 / 2);
-      if (!st.Compare1(keys + idx_parent, keys + j)) {
-        break;
-      }
-      Swap(keys + j, keys + idx_parent);
-      j = idx_parent;
+  while (start < num_lanes) {
+    const size_t left = 2 * start + N1;
+    const size_t right = 2 * start + 2 * N1;
+    if (left >= num_lanes) break;
+    size_t idx_larger = start;
+    const auto key_j = st.SetKey(d, lanes + start);
+    if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
+      idx_larger = left;
     }
-  }
-
-  for (size_t i = num - 1; i != 0; i -= 1) {
-    // Swap root with last
-    Swap(keys + 0, keys + i);
-
-    // Sift down the new root.
-    size_t j = 0;
-    while (j < i) {
-      const size_t left = 2 * j + 1;
-      const size_t right = 2 * j + 2;
-      if (left >= i) break;
-      size_t idx_larger = j;
-      if (st.Compare1(keys + j, keys + left)) {
-        idx_larger = left;
-      }
-      if (right < i && st.Compare1(keys + idx_larger, keys + right)) {
-        idx_larger = right;
-      }
-      if (idx_larger == j) break;
-      Swap(keys + j, keys + idx_larger);
-      j = idx_larger;
+    if (right < num_lanes &&
+        AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
+                              st.SetKey(d, lanes + right)))) {
+      idx_larger = right;
     }
+    if (idx_larger == start) break;
+    st.Swap(lanes + start, lanes + idx_larger);
+    start = idx_larger;
   }
 }
 
-#else
-
-using Constants = hwy::SortConstants;
-
-// ------------------------------ HeapSort
-
 // Heapsort: O(1) space, O(N*logN) worst-case comparisons.
 // Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
 template <class Traits, typename T>
-void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
+void HeapSort(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes) {
   constexpr size_t N1 = st.LanesPerKey();
-  const FixedTag<T, N1> d;
 
-  if (num < 2 * N1) return;
+  if (num_lanes < 2 * N1) return;
 
   // Build heap.
-  for (size_t i = N1; i < num; i += N1) {
-    size_t j = i;
-    while (j != 0) {
-      const size_t idx_parent = ((j - N1) / N1 / 2) * N1;
-      if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent),
-                                 st.SetKey(d, keys + j)))) {
-        break;
-      }
-      st.Swap(keys + j, keys + idx_parent);
-      j = idx_parent;
-    }
+  for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
+    SiftDown(st, lanes, num_lanes, i);
   }
 
-  for (size_t i = num - N1; i != 0; i -= N1) {
+  for (size_t i = num_lanes - N1; i != 0; i -= N1) {
     // Swap root with last
-    st.Swap(keys + 0, keys + i);
+    st.Swap(lanes + 0, lanes + i);
 
     // Sift down the new root.
-    size_t j = 0;
-    while (j < i) {
-      const size_t left = 2 * j + N1;
-      const size_t right = 2 * j + 2 * N1;
-      if (left >= i) break;
-      size_t idx_larger = j;
-      const auto key_j = st.SetKey(d, keys + j);
-      if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) {
-        idx_larger = left;
-      }
-      if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger),
-                                             st.SetKey(d, keys + right)))) {
-        idx_larger = right;
-      }
-      if (idx_larger == j) break;
-      st.Swap(keys + j, keys + idx_larger);
-      j = idx_larger;
-    }
+    SiftDown(st, lanes, i, 0);
   }
 }
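A small worked example of the heap index arithmetic above when each key spans N1 lanes (the helper and values below are illustrative): indices always refer to the first lane of a key, the children of the key starting at lane p begin at 2*p + N1 and 2*p + 2*N1, and the parent formula inverts this.

constexpr size_t ParentIndex(size_t child, size_t N1) {
  // Inverse of left = 2 * p + N1 and right = 2 * p + 2 * N1.
  return ((child - N1) / N1 / 2) * N1;
}
static_assert(ParentIndex(2, 2) == 0, "left child of the root");
static_assert(ParentIndex(4, 2) == 0, "right child of the root");
static_assert(ParentIndex(6, 2) == 2, "left child of the key at lane 2");
static_assert(ParentIndex(8, 2) == 2, "right child of the key at lane 2");

The heap-build loop above starts from this expression applied to num_lanes, i.e. at (or one key past) the last parent; sifting down a leaf is a no-op.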
 
+#if VQSORT_ENABLED || HWY_IDE
+
 // ------------------------------ BaseCase
 
 // Sorts `keys` within the range [0, num) via sorting network.
 template <class D, class Traits, typename T>
-HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys,
+                           T* HWY_RESTRICT keys_end, size_t num,
                            T* HWY_RESTRICT buf) {
   const size_t N = Lanes(d);
   using V = decltype(Zero(d));
@@ -185,6 +133,18 @@ HWY_NOINLINE void BaseCase(D d, Traits s
       HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2);
   HWY_DASSERT(cols <= N);
 
+  // We can avoid padding and load/store directly to `keys` after checking the
+  // original input array has enough space. Except at the right border, it's OK
+  // to sort more than the current sub-array. Even if we sort across a previous
+  // partition point, we know that keys will not migrate across it. However, we
+  // must use the maximum size of the sorting network, because the StoreU of its
+  // last vector would otherwise write invalid data starting at kMaxRows * cols.
+  const size_t N_sn = Lanes(CappedTag<T, Constants::kMaxCols>());
+  if (HWY_LIKELY(keys + N_sn * Constants::kMaxRows <= keys_end)) {
+    SortingNetwork(st, keys, N_sn);
+    return;
+  }
+
   // Copy `keys` to `buf`.
   size_t i;
   for (i = 0; i + N <= num; i += N) {
@@ -266,35 +226,34 @@ HWY_NOINLINE void PartitionToMultipleOfU
 template <class D, class Traits, typename T>
 HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
                                const Vec<D> pivot, T* HWY_RESTRICT keys,
-                               size_t& writeL, size_t& writeR) {
+                               size_t& writeL, size_t& remaining) {
   const size_t N = Lanes(d);
 
   const auto comp = st.Compare(d, pivot, v);
 
-  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
+  remaining -= N;
+  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value ||
+      (HWY_MAX_BYTES == 16 && st.Is128())) {
     // Non-native Compress (e.g. AVX2): we are able to partition a vector using
     // a single Compress+two StoreU instead of two Compress[Blended]Store. The
     // latter are more expensive. Because we store entire vectors, the contents
     // between the updated writeL and writeR are ignored and will be overwritten
     // by subsequent calls. This works because writeL and writeR are at least
     // two vectors apart.
-    const auto mask = Not(comp);
-    const auto lr = Compress(v, mask);
-    const size_t num_left = CountTrue(d, mask);
+    const auto lr = st.CompressKeys(v, comp);
+    const size_t num_left = N - CountTrue(d, comp);
     StoreU(lr, d, keys + writeL);
-    writeL += num_left;
     // Now write the right-side elements (if any), such that the previous writeR
     // is one past the end of the newly written right elements, then advance.
-    StoreU(lr, d, keys + writeR - N);
-    writeR -= (N - num_left);
+    StoreU(lr, d, keys + remaining + writeL);
+    writeL += num_left;
   } else {
     // Native Compress[Store] (e.g. AVX3), which only keep the left or right
     // side, not both, hence we require two calls.
     const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL);
     writeL += num_left;
 
-    writeR -= (N - num_left);
-    (void)CompressBlendedStore(v, comp, d, keys + writeR);
+    (void)CompressBlendedStore(v, comp, d, keys + remaining + writeL);
   }
 }
 
@@ -303,11 +262,11 @@ HWY_INLINE void StoreLeftRight4(D d, Tra
                                 const Vec<D> v1, const Vec<D> v2,
                                 const Vec<D> v3, const Vec<D> pivot,
                                 T* HWY_RESTRICT keys, size_t& writeL,
-                                size_t& writeR) {
-  StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR);
-  StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR);
-  StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR);
-  StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR);
+                                size_t& remaining) {
+  StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining);
+  StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining);
+  StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining);
+  StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining);
 }
 
 // Moves "<= pivot" keys to the front, and others to the back. pivot is
@@ -332,9 +291,39 @@ HWY_NOINLINE size_t Partition(D d, Trait
   PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf);
   constexpr size_t kUnroll = Constants::kPartitionUnroll;
 
-  // Invariant: [left, writeL) and [writeR, right) are already partitioned.
+  // Partition splits the array into 3 sections, left to right: elements
+  // smaller than or equal to the pivot, unpartitioned elements, and elements
+  // larger than the pivot. To write elements unconditionally in the loop body
+  // without overwriting existing data, we maintain two regions of the array
+  // whose elements have already been copied elsewhere (e.g. into vector
+  // registers). We call these bufferL and bufferR, for left and right
+  // respectively.
+  //
+  // These regions are tracked by the indices (writeL, writeR, left, right) as
+  // presented in the diagram below.
+  //
+  //              writeL                                  writeR
+  //               \/                                       \/
+  //  |  <= pivot   | bufferL |   unpartitioned   | bufferR |   > pivot   |
+  //                          \/                  \/
+  //                         left                 right
+  //
+  // In the main loop body below we choose a side, load some elements from that
+  // side of the array and advance either `left` or `right`. Next we call into
+  // StoreLeftRight to partition the data; the partitioned elements are written
+  // at writeL or writeR and the corresponding index is advanced accordingly.
+  //
+  // Note that writeR is not explicitly tracked as an optimization for platforms
+  // with conditional operations. Instead we track writeL and the number of
+  // elements left to process (`remaining`). From the diagram above we can see
+  // that:
+  //    writeR - writeL = remaining => writeR = remaining + writeL
+  //
+  // Tracking `remaining` is advantageous because each iteration reduces the
+  // number of unpartitioned elements by a fixed amount, so we can compute
+  // `remaining` without data dependencies.
+  //
   size_t writeL = left;
-  size_t writeR = right;
+  size_t remaining = right - left;
 
   const size_t num = right - left;
   // Cannot load if there were fewer than 2 * kUnroll * N.
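A scalar model of the invariant described in the comment above (the function, names, and the separate output array are illustrative only; the real code partitions in place by first loading kUnroll vectors from each end): tracking `remaining` instead of writeR, every right-side store lands at index writeL + remaining, exactly the slot that writeR - 1 would have designated.

#include <stddef.h>
#include <stdint.h>

inline size_t PartitionScalarSketch(const uint64_t* keys, size_t num,
                                    uint64_t pivot, uint64_t* out) {
  size_t writeL = 0;
  size_t remaining = num;
  for (size_t i = 0; i < num; ++i) {
    const uint64_t v = keys[i];
    remaining -= 1;  // one fewer unpartitioned element, data-independent
    if (v <= pivot) {
      out[writeL++] = v;  // grow the "<= pivot" region at the front
    } else {
      out[writeL + remaining] = v;  // same slot a decremented writeR would give
    }
  }
  return writeL;  // boundary between the two partitions
}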
@@ -358,12 +347,33 @@ HWY_NOINLINE size_t Partition(D d, Trait
     while (left != right) {
       V v0, v1, v2, v3;
 
-      // Free up capacity for writing by loading from the side that has less.
       // Data-dependent but branching is faster than forcing branch-free.
       const size_t capacityL = left - writeL;
-      const size_t capacityR = writeR - right;
-      HWY_DASSERT(capacityL <= num && capacityR <= num);  // >= 0
-      if (capacityR < capacityL) {
+      HWY_DASSERT(capacityL <= num);  // >= 0
+      // Load keys from whichever end of the array (front or back) currently
+      // has less spare capacity, freeing up space for the stores below.
+      // The next paragraphs explain how this works.
+      //
+      // let block_size = (kUnroll * N)
+      // On the loop prelude we load block_size elements from the front of the
+      // vector and an additional block_size elements from the back. On each
+      // iteration k elements are written to the front of the vector and
+      // (block_size - k) to the back.
+      //
+      // This creates a loop invariant where the capacity on the front
+      // (capacityL) and on the back (capacityR) always add to 2 * block_size.
+      // In other words:
+      //    capacityL + capacityR = 2 * block_size
+      //    capacityR = 2 * block_size - capacityL
+      //
+      // This means that:
+      //    capacityL < capacityR <=>
+      //    capacityL < 2 * block_size - capacityL <=>
+      //    2 * capacityL < 2 * block_size <=>
+      //    capacityL < block_size
+      //
+      // Thus the check on the next line is equivalent to capacityL > capacityR.
+      //
+      if (kUnroll * N < capacityL) {
         right -= kUnroll * N;
         v0 = LoadU(d, keys + right + 0 * N);
         v1 = LoadU(d, keys + right + 1 * N);
@@ -379,16 +389,16 @@ HWY_NOINLINE size_t Partition(D d, Trait
         hwy::Prefetch(keys + left + 3 * kUnroll * N);
       }
 
-      StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR);
+      StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining);
     }
 
     // Now finish writing the initial left/right to the middle.
-    StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR);
-    StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR);
+    StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining);
+    StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, remaining);
   }
 
   // We have partitioned [left, right) such that writeL is the boundary.
-  HWY_DASSERT(writeL == writeR);
+  HWY_DASSERT(remaining == 0);
   // Make space for inserting vlast: move up to N of the first right-side keys
   // into the unused space starting at last. If we have fewer, ensure they are
   // the last items in that vector by subtracting from the *load* address,
@@ -606,9 +616,9 @@ HWY_NOINLINE void ScanMinMax(D d, Traits
 }
 
 template <class D, class Traits, typename T>
-void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
-             const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf,
-             Generator& rng, size_t remaining_levels) {
+void Recurse(D d, Traits st, T* HWY_RESTRICT keys, T* HWY_RESTRICT keys_end,
+             const size_t begin, const size_t end, const Vec<D> pivot,
+             T* HWY_RESTRICT buf, Generator& rng, size_t remaining_levels) {
   HWY_DASSERT(begin + 1 < end);
   const size_t num = end - begin;  // >= 2
 
@@ -640,22 +650,24 @@ void Recurse(D d, Traits st, T* HWY_REST
 
     // Separate recursion to make sure that we don't pick `last` as the
     // pivot - that would again lead to a degenerate partition.
-    Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1);
+    Recurse(d, st, keys, keys_end, begin, end, first, buf, rng,
+            remaining_levels - 1);
     return;
   }
 
   if (HWY_UNLIKELY(num_left <= base_case_num)) {
-    BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf);
+    BaseCase(d, st, keys + begin, keys_end, static_cast<size_t>(num_left), buf);
   } else {
     const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng);
-    Recurse(d, st, keys, begin, bound, next_pivot, buf, rng,
+    Recurse(d, st, keys, keys_end, begin, bound, next_pivot, buf, rng,
             remaining_levels - 1);
   }
   if (HWY_UNLIKELY(num_right <= base_case_num)) {
-    BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf);
+    BaseCase(d, st, keys + bound, keys_end, static_cast<size_t>(num_right),
+             buf);
   } else {
     const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng);
-    Recurse(d, st, keys, bound, end, next_pivot, buf, rng,
+    Recurse(d, st, keys, keys_end, bound, end, next_pivot, buf, rng,
             remaining_levels - 1);
   }
 }
@@ -670,7 +682,7 @@ bool HandleSpecialCases(D d, Traits st,
   // 128-bit keys require vectors with at least two u64 lanes, which is always
   // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
   // hardware vector width is less than 128bit / fraction.
-  const bool partial_128 = N < 2 && st.Is128();
+  const bool partial_128 = !IsFull(d) && N < 2 && st.Is128();
   // Partition assumes its input is at least two vectors. If vectors are huge,
   // base_case_num may actually be smaller. If so, which is only possible on
   // RVV, pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of
@@ -686,7 +698,7 @@ bool HandleSpecialCases(D d, Traits st,
 
   // Small arrays: use sorting network, no need for other checks.
   if (HWY_UNLIKELY(num <= base_case_num)) {
-    BaseCase(d, st, keys, num, buf);
+    BaseCase(d, st, keys, keys + num, num, buf);
     return true;
   }
 
@@ -696,7 +708,7 @@ bool HandleSpecialCases(D d, Traits st,
   return false;  // not finished sorting
 }
 
-#endif  // HWY_TARGET
+#endif  // VQSORT_ENABLED
 }  // namespace detail
 
 // Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
@@ -713,12 +725,7 @@ bool HandleSpecialCases(D d, Traits st,
 template <class D, class Traits, typename T>
 void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
           T* HWY_RESTRICT buf) {
-#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
-  (void)d;
-  (void)buf;
-  // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
-  return detail::HeapSort(st, keys, num);
-#else
+#if VQSORT_ENABLED || HWY_IDE
 #if !HWY_HAVE_SCALABLE
   // On targets with fixed-size vectors, avoid _using_ the allocated memory.
   // We avoid (potentially expensive for small input sizes) allocations on
@@ -745,8 +752,13 @@ void Sort(D d, Traits st, T* HWY_RESTRIC
   // Introspection: switch to worst-case N*logN heapsort after this many.
   const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
 
-  detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels);
-#endif  // HWY_TARGET
+  detail::Recurse(d, st, keys, keys + num, 0, num, pivot, buf, rng, max_levels);
+#else
+  (void)d;
+  (void)buf;
+  // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
+  return detail::HeapSort(st, keys, num);
+#endif  // VQSORT_ENABLED
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_kv128a.cc 1.0.0-2/hwy/contrib/sort/vqsort_kv128a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_kv128a.cc	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_kv128a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,65 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc"  //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
+                  uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
+  SortTag<uint64_t> d;
+  detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st;
+  Sort(d, st, keys, num, buf);
+#else
+  (void) keys;
+  (void) num;
+  (void) buf;
+  HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV128Asc);
+}  // namespace
+
+void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
+                        SortAscending) const {
+  HWY_DYNAMIC_DISPATCH(SortKV128Asc)
+  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
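A usage sketch for this new key-value entry point. Assumptions, not shown by the snippet above: hwy::K64V64 places the payload in its lower u64 and the sort key in its upper u64 (matching the KV128 comparator, which orders by lane [1]), and its members are named `key` and `value`.

#include <vector>
#include "hwy/contrib/sort/vqsort.h"

void SortPairsExample() {
  std::vector<hwy::K64V64> pairs(1024);
  for (size_t i = 0; i < pairs.size(); ++i) {
    pairs[i].key = pairs.size() - i;  // field assumed to be the sort key
    pairs[i].value = i;               // payload carried along with its key
  }
  hwy::Sorter()(pairs.data(), pairs.size(), hwy::SortAscending());
}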
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_kv128d.cc 1.0.0-2/hwy/contrib/sort/vqsort_kv128d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_kv128d.cc	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_kv128d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,65 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc"  //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
+                   uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
+  SortTag<uint64_t> d;
+  detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st;
+  Sort(d, st, keys, num, buf);
+#else
+  (void) keys;
+  (void) num;
+  (void) buf;
+  HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV128Desc);
+}  // namespace
+
+void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
+                        SortDescending) const {
+  HWY_DYNAMIC_DISPATCH(SortKV128Desc)
+  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_u16a.cc 1.0.0-2/hwy/contrib/sort/vqsort_u16a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_u16a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_u16a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,20 +13,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
 #include "hwy/contrib/sort/vqsort-inl.h"
 
-// Workaround for build timeout
-#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
@@ -34,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
                 uint16_t* HWY_RESTRICT buf) {
   SortTag<uint16_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
@@ -56,5 +52,3 @@ void Sorter::operator()(uint16_t* HWY_RE
 
 }  // namespace hwy
 #endif  // HWY_ONCE
-
-#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_u16d.cc 1.0.0-2/hwy/contrib/sort/vqsort_u16d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_u16d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_u16d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,20 +13,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
 #include "hwy/contrib/sort/vqsort-inl.h"
 
-// Workaround for build timeout
-#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
@@ -34,7 +30,8 @@ namespace HWY_NAMESPACE {
 void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
                  uint16_t* HWY_RESTRICT buf) {
   SortTag<uint16_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
+      st;
   Sort(d, st, keys, num, buf);
 }
 
@@ -56,5 +53,3 @@ void Sorter::operator()(uint16_t* HWY_RE
 
 }  // namespace hwy
 #endif  // HWY_ONCE
-
-#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_u32a.cc 1.0.0-2/hwy/contrib/sort/vqsort_u32a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_u32a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_u32a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
                 uint32_t* HWY_RESTRICT buf) {
   SortTag<uint32_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_u32d.cc 1.0.0-2/hwy/contrib/sort/vqsort_u32d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_u32d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_u32d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,8 @@ namespace HWY_NAMESPACE {
 void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
                  uint32_t* HWY_RESTRICT buf) {
   SortTag<uint32_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
+      st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_u64a.cc 1.0.0-2/hwy/contrib/sort/vqsort_u64a.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_u64a.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_u64a.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
 void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
   SortTag<uint64_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/contrib/sort/vqsort_u64d.cc 1.0.0-2/hwy/contrib/sort/vqsort_u64d.cc
--- 0.17.0-11/hwy/contrib/sort/vqsort_u64d.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/contrib/sort/vqsort_u64d.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,12 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "hwy/contrib/sort/disabled_targets.h"
 #include "hwy/contrib/sort/vqsort.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // After foreach_target
 #include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,8 @@ namespace HWY_NAMESPACE {
 void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                  uint64_t* HWY_RESTRICT buf) {
   SortTag<uint64_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
+      st;
   Sort(d, st, keys, num, buf);
 }
 
diff -pruN 0.17.0-11/hwy/detect_compiler_arch.h 1.0.0-2/hwy/detect_compiler_arch.h
--- 0.17.0-11/hwy/detect_compiler_arch.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/detect_compiler_arch.h	2022-07-27 11:48:16.000000000 +0000
@@ -30,9 +30,8 @@
 //------------------------------------------------------------------------------
 // Compiler
 
-// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
-// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
-// purpose.
+// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
+// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
 #if defined(_MSC_VER) && !defined(__clang__)
 #define HWY_COMPILER_MSVC _MSC_VER
 #else
@@ -51,19 +50,25 @@
 #define HWY_COMPILER_ICC 0
 #endif
 
+// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
+// compiler extensions (e.g. Clang, Intel).
 #ifdef __GNUC__
 #define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
 #else
 #define HWY_COMPILER_GCC 0
 #endif
 
-// Clang can masquerade as MSVC/GCC, in which case both are set.
+// Clang or clang-cl, not GCC.
 #ifdef __clang__
-#ifdef __APPLE__
-// Apple LLVM version is unrelated to the actual Clang version, which we need
-// for enabling workarounds. Use the presence of warning flags to deduce it.
+// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
+// an invalid version number, deduce it from the presence of warnings.
 // Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
-#if __has_warning("-Wformat-insufficient-args")
+#if defined(__APPLE__) || __clang_major__ >= 999
+#if __has_warning("-Wbitwise-instead-of-logical")
+#define HWY_COMPILER_CLANG 1400
+#elif __has_warning("-Wreserved-identifier")
+#define HWY_COMPILER_CLANG 1300
+#elif __has_warning("-Wformat-insufficient-args")
 #define HWY_COMPILER_CLANG 1200
 #elif __has_warning("-Wimplicit-const-int-float-conversion")
 #define HWY_COMPILER_CLANG 1100
@@ -79,19 +84,32 @@
 #else  // Anything older than 7.0 is not recommended for Highway.
 #define HWY_COMPILER_CLANG 600
 #endif  // __has_warning chain
-#else   // Non-Apple: normal version
+#else   // use normal version
 #define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
 #endif
 #else  // Not clang
 #define HWY_COMPILER_CLANG 0
 #endif
 
+#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
+#else
+#define HWY_COMPILER_GCC_ACTUAL 0
+#endif
+
 // More than one may be nonzero, but we want at least one.
-#if !HWY_COMPILER_MSVC && !HWY_COMPILER_CLANGCL && !HWY_COMPILER_ICC && \
-    !HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
+          HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
 #error "Unsupported compiler"
 #endif
 
+// We should only detect one of these (only clang/clangcl overlap)
+#if 1 <                                                                     \
+    (!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
+     !!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
+#error "Detected multiple compilers"
+#endif
+
 #ifdef __has_builtin
 #define HWY_HAS_BUILTIN(name) __has_builtin(name)
 #else
@@ -147,7 +165,7 @@
 #define HWY_ARCH_ARM_A64 0
 #endif
 
-#if defined(__arm__) || defined(_M_ARM)
+#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
 #define HWY_ARCH_ARM_V7 1
 #else
 #define HWY_ARCH_ARM_V7 0
@@ -157,12 +175,20 @@
 #error "Cannot have both A64 and V7"
 #endif
 
+// Any *supported* version of Arm, i.e. 7 or later
 #if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
 #define HWY_ARCH_ARM 1
 #else
 #define HWY_ARCH_ARM 0
 #endif
 
+// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
+#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
+#define HWY_ARCH_ARM_OLD 1
+#else
+#define HWY_ARCH_ARM_OLD 0
+#endif
+
 #if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
 #define HWY_ARCH_WASM 1
 #else
@@ -177,9 +203,21 @@
 
 // It is an error to detect multiple architectures at the same time, but OK to
 // detect none of the above.
-#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
-     HWY_ARCH_RVV) > 1
+#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
+     HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
 #error "Must not detect more than one architecture"
 #endif
 
+#if defined(_WIN32) || defined(_WIN64)
+#define HWY_OS_WIN 1
+#else
+#define HWY_OS_WIN 0
+#endif
+
+#if defined(linux) || defined(__linux__)
+#define HWY_OS_LINUX 1
+#else
+#define HWY_OS_LINUX 0
+#endif
+
 #endif  // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
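Note: the HWY_COMPILER_GCC_ACTUAL macro added above distinguishes real GCC from Clang, which also defines __GNUC__ and therefore sets HWY_COMPILER_GCC. A minimal sketch (hypothetical consumer code, not part of this patch) of how the disambiguated macros are typically used to scope a compiler-specific workaround:

    #include "hwy/detect_compiler_arch.h"

    // Hypothetical: a workaround needed only for real GCC 10.x/11.x.
    // HWY_COMPILER_GCC_ACTUAL is 0 under Clang and clang-cl, so they skip it.
    #if HWY_COMPILER_GCC_ACTUAL >= 1000 && HWY_COMPILER_GCC_ACTUAL < 1200
    #define MY_GCC_WORKAROUND 1
    #else
    #define MY_GCC_WORKAROUND 0
    #endif
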
diff -pruN 0.17.0-11/hwy/detect_targets.h 1.0.0-2/hwy/detect_targets.h
--- 0.17.0-11/hwy/detect_targets.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/detect_targets.h	2022-07-27 11:48:16.000000000 +0000
@@ -51,58 +51,69 @@
 // All values are unconditionally defined so we can test HWY_TARGETS without
 // first checking the HWY_ARCH_*.
 //
-// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
-// can use 32-bit literals.
-
-// 1,2: reserved
+// The C99 preprocessor evaluates #if expressions using intmax_t types. This
+// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
+// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We can
+// thus use 63 bits (avoids overflow when computing HWY_TARGETS).
 
+// --------------------------- x86: 15 targets (+ one fallback)
+// Bits 0..6 reserved (7 targets)
 // Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
 // VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
 // Tiger Lake? We do not yet have uses for GFNI.
-#define HWY_AVX3_DL 4  // see HWY_WANT_AVX3_DL below
-#define HWY_AVX3 8
-#define HWY_AVX2 16
-// 32: reserved for AVX
-#define HWY_SSE4 64
-#define HWY_SSSE3 128
-// 0x100, 0x200: reserved for SSE3, SSE2
-
+#define HWY_AVX3_DL (1LL << 7)  // see HWY_WANT_AVX3_DL below
+#define HWY_AVX3 (1LL << 8)
+#define HWY_AVX2 (1LL << 9)
+// Bit 10: reserved for AVX
+#define HWY_SSE4 (1LL << 11)
+#define HWY_SSSE3 (1LL << 12)
+// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
 // The highest bit in the HWY_TARGETS mask that an x86 target can have. Used for
 // dynamic dispatch. All x86 target bits must be lower or equal to
 // (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
 // HWY_MAX_DYNAMIC_TARGETS in total.
-#define HWY_HIGHEST_TARGET_BIT_X86 9
-
-#define HWY_SVE2 0x400
-#define HWY_SVE 0x800
-// 0x1000 reserved for Helium
-#define HWY_NEON 0x2000
-
-#define HWY_HIGHEST_TARGET_BIT_ARM 13
-
-// 0x4000, 0x8000 reserved
-#define HWY_PPC8 0x10000  // v2.07 or 3
-// 0x20000, 0x40000 reserved for prior VSX/AltiVec
-
-#define HWY_HIGHEST_TARGET_BIT_PPC 18
-
-#define HWY_WASM2 0x80000  // Experimental
-#define HWY_WASM 0x100000
-
-#define HWY_HIGHEST_TARGET_BIT_WASM 20
-
-// 0x200000, 0x400000, 0x800000 reserved
-
-#define HWY_RVV 0x1000000
-
-#define HWY_HIGHEST_TARGET_BIT_RVV 24
-
-// 0x2000000, 0x4000000, 0x8000000 reserved
-
-#define HWY_EMU128 0x10000000
-#define HWY_SCALAR 0x20000000
+#define HWY_HIGHEST_TARGET_BIT_X86 14
 
-#define HWY_HIGHEST_TARGET_BIT_SCALAR 29
+// --------------------------- Arm: 15 targets (+ one fallback)
+// Bits 15..23 reserved (9 targets)
+#define HWY_SVE2_128 (1LL << 24)  // specialized target (e.g. Arm N2)
+#define HWY_SVE_256 (1LL << 25)   // specialized target (e.g. Arm V1)
+#define HWY_SVE2 (1LL << 26)
+#define HWY_SVE (1LL << 27)
+#define HWY_NEON (1LL << 28)  // On A64, includes/requires AES
+// Bit 29 reserved (Helium?)
+#define HWY_HIGHEST_TARGET_BIT_ARM 29
+
+// --------------------------- RISC-V: 9 targets (+ one fallback)
+// Bits 30..36 reserved (7 targets)
+#define HWY_RVV (1LL << 37)
+// Bit 38 reserved
+#define HWY_HIGHEST_TARGET_BIT_RVV 38
+
+// --------------------------- Future expansion: 4 targets
+// Bits 39..42 reserved
+
+
+// --------------------------- IBM Power: 9 targets (+ one fallback)
+// Bits 43..48 reserved (6 targets)
+#define HWY_PPC8 (1LL << 49)  // v2.07 or 3
+// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
+#define HWY_HIGHEST_TARGET_BIT_PPC 51
+
+// --------------------------- WebAssembly: 9 targets (+ one fallback)
+// Bits 52..57 reserved (6 targets)
+#define HWY_WASM_EMU256 (1LL << 58)  // Experimental
+#define HWY_WASM (1LL << 59)
+// Bit 60 reserved
+#define HWY_HIGHEST_TARGET_BIT_WASM 60
+
+// --------------------------- Emulation: 2 targets
+
+#define HWY_EMU128 (1LL << 61)
+// Only used if HWY_COMPILE_ONLY_SCALAR, which disables the 2LL *
+// HWY_STATIC_TARGET case, so int64_t overflow will not occur.
+#define HWY_SCALAR (1LL << 62)
+#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
 
 // Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
 
@@ -144,9 +155,9 @@
 #define HWY_BROKEN_TARGETS (HWY_NEON)
 
 // SVE[2] require recent clang or gcc versions.
-#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
-(!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
-#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2)
+#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
+    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
+#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
 
 #else
 #define HWY_BROKEN_TARGETS 0
@@ -158,6 +169,19 @@
 #define HWY_ENABLED(targets) \
   ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
 
+// Opt-out for EMU128 (affected by a GCC <12 bug on ARMv7: see
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106187). This is separate from
+// HWY_BROKEN_TARGETS because it affects the fallback target, which must always
+// be enabled. If 1, we instead choose HWY_SCALAR even without
+// HWY_COMPILE_ONLY_SCALAR being set.
+#if !defined(HWY_BROKEN_EMU128)  // allow overriding
+#if HWY_ARCH_ARM_V7 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1140
+#define HWY_BROKEN_EMU128 1
+#else
+#define HWY_BROKEN_EMU128 0
+#endif
+#endif  // HWY_BROKEN_EMU128
+
 //------------------------------------------------------------------------------
 // Detect baseline targets using predefined macros
 
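Note on the hunk above: HWY_ENABLED strips both the user-specified HWY_DISABLED_TARGETS and the compiler blocklist from a candidate set, while HWY_BROKEN_EMU128 is a separate opt-out because the fallback target must always remain enabled. A hedged sketch of the usual opt-out mechanism (the disabled targets chosen here are purely illustrative):

    // Before including Highway headers (or via -D on the compiler command line):
    #define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
    // HWY_ENABLED(x) then clears these bits (plus HWY_BROKEN_TARGETS) from x,
    // so neither target can appear in HWY_TARGETS; EMU128/SCALAR stay available.
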
@@ -165,7 +189,7 @@
 // instructions, implying the target CPU would have to support them. This does
 // not take the blocklist into account.
 
-#if defined(HWY_COMPILE_ONLY_SCALAR)
+#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
 #define HWY_BASELINE_SCALAR HWY_SCALAR
 #else
 #define HWY_BASELINE_SCALAR HWY_EMU128
@@ -176,7 +200,7 @@
 
 #if HWY_ARCH_WASM && defined(__wasm_simd128__)
 #if defined(HWY_WANT_WASM2)
-#define HWY_BASELINE_WASM HWY_WASM2
+#define HWY_BASELINE_WASM HWY_WASM_EMU256
 #else
 #define HWY_BASELINE_WASM HWY_WASM
 #endif  // HWY_WANT_WASM2
@@ -198,6 +222,11 @@
 #endif
 
 #if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
+// Baseline targets can be used unconditionally, which does not apply to
+// HWY_SVE_256 because it requires a vector size of 256 bits. Including SVE_256
+// in the baseline would also disable all 'worse' targets (including SVE and
+// SVE2) in non-test builds. Therefore we instead add HWY_SVE_256 to
+// HWY_ATTAINABLE_TARGETS below.
 #define HWY_BASELINE_SVE HWY_SVE
 #else
 #define HWY_BASELINE_SVE 0
@@ -211,7 +240,7 @@
 #endif
 
 // Special handling for MSVC because it has fewer predefined macros:
-#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
+#if HWY_COMPILER_MSVC
 
 // 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
 //    https://stackoverflow.com/questions/18563978/.
@@ -337,10 +366,19 @@
 //------------------------------------------------------------------------------
 // Choose targets for dynamic dispatch according to one of four policies
 
-#if defined(HWY_COMPILE_ONLY_SCALAR) && defined(HWY_COMPILE_ONLY_STATIC)
-#error "Defined both HWY_COMPILE_ONLY_{SCALAR|STATIC} - bug?"
+#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
+         defined(HWY_COMPILE_ONLY_STATIC))
+#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
+#endif
+// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
+
+// x86 compilers generally allow runtime dispatch. On Arm, currently only GCC
+// does, and we require Linux to detect CPU capabilities.
+#if HWY_ARCH_X86 || (HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX)
+#define HWY_HAVE_RUNTIME_DISPATCH 1
+#else
+#define HWY_HAVE_RUNTIME_DISPATCH 0
 #endif
-// Defining either HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
 
 // AVX3_DL is not widely available yet. To reduce code size and compile time,
 // only include it in the set of attainable targets (for dynamic dispatch) if
@@ -351,19 +389,45 @@
 #define HWY_ATTAINABLE_AVX3_DL 0
 #endif
 
+#if HWY_ARCH_ARM_A64 && \
+    ((HWY_ENABLED_BASELINE & HWY_SVE) || HWY_HAVE_RUNTIME_DISPATCH)
+#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
+#else
+#define HWY_ATTAINABLE_SVE 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && \
+    ((HWY_ENABLED_BASELINE & HWY_SVE2) || HWY_HAVE_RUNTIME_DISPATCH)
+#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
+#else
+#define HWY_ATTAINABLE_SVE2 0
+#endif
+
 // Attainable means enabled and the compiler allows intrinsics (even when not
 // allowed to autovectorize). Used in 3 and 4.
 #if HWY_ARCH_X86
 #define HWY_ATTAINABLE_TARGETS                                        \
   HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
               HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
+#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
+#define HWY_ATTAINABLE_TARGETS                                      \
+  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
+              HWY_ATTAINABLE_SVE2)
 #else
-#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
+#define HWY_ATTAINABLE_TARGETS \
+  (HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
 #endif
 
-// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
-// to ~HWY_SCALAR, but this is more explicit).
-#if defined(HWY_COMPILE_ONLY_SCALAR)
+// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
+#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
+#undef HWY_STATIC_TARGET
+#define HWY_STATIC_TARGET HWY_EMU128  // override baseline
+#define HWY_TARGETS HWY_EMU128
+
+// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
+// we currently still support it for backwards compatibility.
+#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
+    (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
 #undef HWY_STATIC_TARGET
 #define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
 #define HWY_TARGETS HWY_SCALAR
@@ -379,7 +443,7 @@
 // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
 // excluding superseded targets, in particular scalar.
 #else
-#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
+#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2LL * HWY_STATIC_TARGET - 1LL))
 
 #endif  // target policy
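Note: a worked example (values taken from the definitions above, not from the patch) of the default policy's mask arithmetic. Because better targets use numerically smaller bits, 2LL * HWY_STATIC_TARGET - 1LL sets exactly the static target's bit plus all lower bits, keeping the baseline and everything better while dropping superseded targets such as the fallback:

    // Suppose the static (baseline) target is AVX2 = 1LL << 9.
    constexpr long long kStatic = HWY_AVX2;           // bit 9
    constexpr long long kMask = 2LL * kStatic - 1LL;  // bits 0..9 set
    static_assert((kMask & HWY_AVX3) != 0, "better target kept");         // bit 8
    static_assert((kMask & HWY_SSE4) == 0, "superseded target dropped");  // bit 11
    static_assert((kMask & HWY_EMU128) == 0, "fallback dropped");         // bit 61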
 
diff -pruN 0.17.0-11/hwy/examples/benchmark.cc 1.0.0-2/hwy/examples/benchmark.cc
--- 0.17.0-11/hwy/examples/benchmark.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/examples/benchmark.cc	2022-07-27 11:48:16.000000000 +0000
@@ -13,10 +13,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
-#include "hwy/foreach_target.h"
-
 #include <inttypes.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -25,8 +21,12 @@
 #include <memory>
 #include <numeric>  // iota
 
-#include "hwy/aligned_allocator.h"
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+
 // Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/aligned_allocator.h"
 #include "hwy/highway.h"
 #include "hwy/nanobenchmark.h"
 
@@ -82,7 +82,8 @@ void RunBenchmark(const char* caption) {
   benchmark.Verify(num_items);
 
   for (size_t i = 0; i < num_results; ++i) {
-    const double cycles_per_item = results[i].ticks / double(results[i].input);
+    const double cycles_per_item =
+        results[i].ticks / static_cast<double>(results[i].input);
     const double mad = results[i].variability * cycles_per_item;
     printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
            static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
@@ -234,7 +235,7 @@ namespace hwy {
 HWY_EXPORT(RunBenchmarks);
 
 void Run() {
-  for (uint32_t target : SupportedAndGeneratedTargets()) {
+  for (int64_t target : SupportedAndGeneratedTargets()) {
     SetSupportedTargetsForTest(target);
     HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
   }
diff -pruN 0.17.0-11/hwy/examples/skeleton.cc 1.0.0-2/hwy/examples/skeleton.cc
--- 0.17.0-11/hwy/examples/skeleton.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/examples/skeleton.cc	2022-07-27 11:48:16.000000000 +0000
@@ -23,7 +23,7 @@
 // __FILE__ is not reliable) so that foreach_target.h can re-include it.
 #define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
 // Generates code for each enabled target by re-including this source file.
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // Must come after foreach_target.h to avoid redefinition errors.
 #include "hwy/highway.h"
@@ -36,7 +36,7 @@ namespace skeleton {
 namespace HWY_NAMESPACE {
 
 // Highway ops reside here; ADL does not find templates nor builtins.
-using namespace hwy::HWY_NAMESPACE;
+namespace hn = hwy::HWY_NAMESPACE;
 
 // Computes log2 by converting to a vector of floats. Compiled once per target.
 template <class DF>
@@ -44,13 +44,13 @@ HWY_ATTR_NO_MSAN void OneFloorLog2(const
                                    const uint8_t* HWY_RESTRICT values,
                                    uint8_t* HWY_RESTRICT log2) {
   // Type tags for converting to other element types (Rebind = same count).
-  const RebindToSigned<DF> d32;
-  const Rebind<uint8_t, DF> d8;
+  const hn::RebindToSigned<DF> d32;
+  const hn::Rebind<uint8_t, DF> d8;
 
-  const auto u8 = Load(d8, values);
-  const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
-  const auto exponent = Sub(ShiftRight<23>(bits), Set(d32, 127));
-  Store(DemoteTo(d8, exponent), d8, log2);
+  const auto u8 = hn::Load(d8, values);
+  const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
+  const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
+  hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
 }
 
 void CodepathDemo() {
@@ -68,14 +68,14 @@ void FloorLog2(const uint8_t* HWY_RESTRI
                uint8_t* HWY_RESTRICT log2) {
   CodepathDemo();
 
-  const ScalableTag<float> df;
-  const size_t N = Lanes(df);
+  const hn::ScalableTag<float> df;
+  const size_t N = hn::Lanes(df);
   size_t i = 0;
   for (; i + N <= count; i += N) {
     OneFloorLog2(df, values + i, log2 + i);
   }
   for (; i < count; ++i) {
-    CappedTag<float, 1> d1;
+    hn::CappedTag<float, 1> d1;
     OneFloorLog2(d1, values + i, log2 + i);
   }
 }
diff -pruN 0.17.0-11/hwy/examples/skeleton_test.cc 1.0.0-2/hwy/examples/skeleton_test.cc
--- 0.17.0-11/hwy/examples/skeleton_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/examples/skeleton_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -21,7 +21,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 
 // Must come after foreach_target.h to avoid redefinition errors.
 #include "hwy/highway.h"
@@ -35,13 +35,13 @@ HWY_BEFORE_NAMESPACE();
 namespace skeleton {
 namespace HWY_NAMESPACE {
 
-using namespace hwy::HWY_NAMESPACE;
+namespace hn = hwy::HWY_NAMESPACE;
 
 // Calls function defined in skeleton.cc.
 struct TestFloorLog2 {
   template <class T, class DF>
   HWY_NOINLINE void operator()(T /*unused*/, DF df) {
-    const size_t count = 5 * Lanes(df);
+    const size_t count = 5 * hn::Lanes(df);
     auto in = hwy::AllocateAligned<uint8_t>(count);
     auto expected = hwy::AllocateAligned<uint8_t>(count);
 
@@ -71,7 +71,7 @@ struct TestSumMulAdd {
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
     hwy::RandomState rng;
     const size_t count = 4096;
-    EXPECT_TRUE(count % Lanes(d) == 0);
+    EXPECT_EQ(0, count % hn::Lanes(d));
     auto mul = hwy::AllocateAligned<T>(count);
     auto x = hwy::AllocateAligned<T>(count);
     auto add = hwy::AllocateAligned<T>(count);
diff -pruN 0.17.0-11/hwy/foreach_target.h 1.0.0-2/hwy/foreach_target.h
--- 0.17.0-11/hwy/foreach_target.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/foreach_target.h	2022-07-27 11:48:16.000000000 +0000
@@ -119,6 +119,28 @@
 #endif
 #endif
 
+#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SVE_256
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SVE2_128
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
 #if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
 #undef HWY_TARGET
 #define HWY_TARGET HWY_SSSE3
@@ -174,9 +196,9 @@
 #endif
 #endif
 
-#if (HWY_TARGETS & HWY_WASM2) && (HWY_STATIC_TARGET != HWY_WASM2)
+#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
 #undef HWY_TARGET
-#define HWY_TARGET HWY_WASM2
+#define HWY_TARGET HWY_WASM_EMU256
 #include HWY_TARGET_INCLUDE
 #ifdef HWY_TARGET_TOGGLE
 #undef HWY_TARGET_TOGGLE
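Note: each block in foreach_target.h redefines HWY_TARGET, re-includes HWY_TARGET_INCLUDE, and flips HWY_TARGET_TOGGLE after the pass. Per-target "-inl.h" headers rely on that toggle for their include guards so they are recompiled once per target rather than once per translation unit. A sketch of that guard idiom with a hypothetical header name (the pattern itself follows existing Highway headers):

    // my_ops-inl.h (hypothetical): toggled per-target include guard.
    #if defined(MY_OPS_INL_H_) == defined(HWY_TARGET_TOGGLE)
    #ifdef MY_OPS_INL_H_
    #undef MY_OPS_INL_H_
    #else
    #define MY_OPS_INL_H_
    #endif

    // ... code compiled once for each value of HWY_TARGET ...

    #endif  // MY_OPS_INL_H_ toggle guard
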
diff -pruN 0.17.0-11/hwy/highway.h 1.0.0-2/hwy/highway.h
--- 0.17.0-11/hwy/highway.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/highway.h	2022-07-27 11:48:16.000000000 +0000
@@ -27,8 +27,8 @@
 namespace hwy {
 
 // API version (https://semver.org/); keep in sync with CMakeLists.txt.
-#define HWY_MAJOR 0
-#define HWY_MINOR 17
+#define HWY_MAJOR 1
+#define HWY_MINOR 0
 #define HWY_PATCH 0
 
 //------------------------------------------------------------------------------
@@ -40,7 +40,7 @@ namespace hwy {
 // registers in the group, and is ignored on targets that do not support groups.
 #define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
 #define HWY_FULL2(T, LMUL) \
-  hwy::HWY_NAMESPACE::ScalableTag<T, CeilLog2(HWY_MAX(0, LMUL))>
+  hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
 #define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
 // Workaround for MSVC grouping __VA_ARGS__ into a single argument
 #define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
@@ -72,8 +72,8 @@ namespace hwy {
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_RVV
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_WASM2
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_WASM
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_NEON
@@ -82,6 +82,10 @@ namespace hwy {
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_SVE2
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE_256
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE2_128
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_PPC8
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
 #elif HWY_STATIC_TARGET == HWY_SSSE3
@@ -96,36 +100,6 @@ namespace hwy {
 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
 #endif
 
-// Dynamic dispatch declarations.
-
-template <typename RetType, typename... Args>
-struct FunctionCache {
- public:
-  typedef RetType(FunctionType)(Args...);
-
-  // A template function that when instantiated has the same signature as the
-  // function being called. This function initializes the global cache of the
-  // current supported targets mask used for dynamic dispatch and calls the
-  // appropriate function. Since this mask used for dynamic dispatch is a
-  // global cache, all the highway exported functions, even those exposed by
-  // different modules, will be initialized after this function runs for any one
-  // of those exported functions.
-  template <FunctionType* const table[]>
-  static RetType ChooseAndCall(Args... args) {
-    // If we are running here it means we need to update the chosen target.
-    ChosenTarget& chosen_target = GetChosenTarget();
-    chosen_target.Update();
-    return (table[chosen_target.GetIndex()])(args...);
-  }
-};
-
-// Factory function only used to infer the template parameters RetType and Args
-// from a function passed to the factory.
-template <typename RetType, typename... Args>
-FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
-  return FunctionCache<RetType, Args...>();
-}
-
 // HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
 // nullptr if that target was not compiled.
 #if HWY_TARGETS & HWY_EMU128
@@ -138,10 +112,10 @@ FunctionCache<RetType, Args...> Function
 #define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
 #endif
 
-#if HWY_TARGETS & HWY_WASM2
-#define HWY_CHOOSE_WASM2(FUNC_NAME) &N_WASM2::FUNC_NAME
+#if HWY_TARGETS & HWY_WASM_EMU256
+#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
 #else
-#define HWY_CHOOSE_WASM2(FUNC_NAME) nullptr
+#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
 #endif
 
 #if HWY_TARGETS & HWY_WASM
@@ -174,6 +148,18 @@ FunctionCache<RetType, Args...> Function
 #define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
 #endif
 
+#if HWY_TARGETS & HWY_SVE_256
+#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE2_128
+#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
+#endif
+
 #if HWY_TARGETS & HWY_PPC8
 #define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
 #else
@@ -210,6 +196,53 @@ FunctionCache<RetType, Args...> Function
 #define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
 #endif
 
+// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
+// apparently cannot be an array. Use a function pointer instead, which has the
+// disadvantage that we call the static (not best) target on the first call to
+// any HWY_DYNAMIC_DISPATCH.
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
+#define HWY_DISPATCH_WORKAROUND 1
+#else
+#define HWY_DISPATCH_WORKAROUND 0
+#endif
+
+// Provides a static member function which is what is called during the first
+// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
+// this function are the first entry in the tables created by HWY_EXPORT.
+template <typename RetType, typename... Args>
+struct FunctionCache {
+ public:
+  typedef RetType(FunctionType)(Args...);
+
+#if HWY_DISPATCH_WORKAROUND
+  template <FunctionType* const func>
+  static RetType ChooseAndCall(Args... args) {
+    ChosenTarget& chosen_target = GetChosenTarget();
+    chosen_target.Update(SupportedTargets());
+    return (*func)(args...);
+  }
+#else
+  // A template function that when instantiated has the same signature as the
+  // function being called. This function initializes the bit array of targets
+  // supported by the current CPU and then calls the appropriate entry within
+  // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
+  // exported functions, even those defined by different translation units,
+  // will dispatch directly to the best available target.
+  template <FunctionType* const table[]>
+  static RetType ChooseAndCall(Args... args) {
+    ChosenTarget& chosen_target = GetChosenTarget();
+    chosen_target.Update(SupportedTargets());
+    return (table[chosen_target.GetIndex()])(args...);
+  }
+#endif  // HWY_DISPATCH_WORKAROUND
+};
+
+// Used to deduce the template parameters RetType and Args from a function.
+template <typename RetType, typename... Args>
+FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
+  return FunctionCache<RetType, Args...>();
+}
+
 #define HWY_DISPATCH_TABLE(FUNC_NAME) \
   HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
 
@@ -218,7 +251,7 @@ FunctionCache<RetType, Args...> Function
 // static array must be defined at the same namespace level as the function
 // it is exporting.
 // After being exported, it can be called from other parts of the same source
-// file using HWY_DYNAMIC_DISTPATCH(), in particular from a function wrapper
+// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
 // like in the following example:
 //
 //   #include "hwy/highway.h"
@@ -248,14 +281,29 @@ FunctionCache<RetType, Args...> Function
 // This case still uses a table, although of a single element, to provide the
 // same compile error conditions as with the dynamic dispatch case when multiple
 // targets are being compiled.
-#define HWY_EXPORT(FUNC_NAME)                                       \
-  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
-      const HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {                    \
-          &HWY_STATIC_DISPATCH(FUNC_NAME)}
+#define HWY_EXPORT(FUNC_NAME)                                             \
+  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
+  HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
 #define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
 
 #else
 
+// Simplified version for MSVC 2017: function pointer instead of table.
+#if HWY_DISPATCH_WORKAROUND
+
+#define HWY_EXPORT(FUNC_NAME)                                                \
+  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
+      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
+      /* The first entry in the table initializes the global cache and       \
+       * calls the function from HWY_STATIC_TARGET. */                       \
+      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
+          FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>,      \
+      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
+      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
+  }
+
+#else
+
 // Dynamic dispatch case with one entry per dynamic target plus the fallback
 // target and the initialization wrapper.
 #define HWY_EXPORT(FUNC_NAME)                                                \
@@ -263,11 +311,14 @@ FunctionCache<RetType, Args...> Function
       FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
       /* The first entry in the table initializes the global cache and       \
        * calls the appropriate function. */                                  \
-      &decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH(              \
+      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
           FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,        \
       HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
       HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
   }
+
+#endif  // HWY_DISPATCH_WORKAROUND
+
 #define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
   (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
 
@@ -305,9 +356,10 @@ FunctionCache<RetType, Args...> Function
 #error "PPC is not yet supported"
 #elif HWY_TARGET == HWY_NEON
 #include "hwy/ops/arm_neon-inl.h"
-#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
+#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
+    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
 #include "hwy/ops/arm_sve-inl.h"
-#elif HWY_TARGET == HWY_WASM2
+#elif HWY_TARGET == HWY_WASM_EMU256
 #include "hwy/ops/wasm_256-inl.h"
 #elif HWY_TARGET == HWY_WASM
 #include "hwy/ops/wasm_128-inl.h"
diff -pruN 0.17.0-11/hwy/highway_test.cc 1.0.0-2/hwy/highway_test.cc
--- 0.17.0-11/hwy/highway_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/highway_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -23,7 +23,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "highway_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"    // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/nanobenchmark.h"  // Unpredictable1
 #include "hwy/tests/test_util-inl.h"
diff -pruN 0.17.0-11/hwy/nanobenchmark.cc 1.0.0-2/hwy/nanobenchmark.cc
--- 0.17.0-11/hwy/nanobenchmark.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/nanobenchmark.cc	2022-07-27 11:48:16.000000000 +0000
@@ -25,7 +25,7 @@
 #include <algorithm>  // sort
 #include <array>
 #include <atomic>
-#include <chrono>
+#include <chrono>  //NOLINT
 #include <limits>
 #include <numeric>  // iota
 #include <random>
@@ -311,7 +311,8 @@ T MedianAbsoluteDeviation(const T* value
   std::vector<T> abs_deviations;
   abs_deviations.reserve(num_values);
   for (size_t i = 0; i < num_values; ++i) {
-    const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
+    const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
+                                 static_cast<int64_t>(median));
     abs_deviations.push_back(static_cast<T>(abs));
   }
   return Median(abs_deviations.data(), num_values);
@@ -425,7 +426,7 @@ std::string BrandString() {
 
 HWY_DLLEXPORT double InvariantTicksPerSecond() {
 #if HWY_ARCH_PPC && defined(__GLIBC__)
-  return double(__ppc_get_timebase_freq());
+  return static_cast<double>(__ppc_get_timebase_freq());
 #elif HWY_ARCH_X86 || HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
   // We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
   static const double freq = MeasureNominalClockRate();
@@ -433,12 +434,12 @@ HWY_DLLEXPORT double InvariantTicksPerSe
 #elif defined(_WIN32) || defined(_WIN64)
   LARGE_INTEGER freq;
   (void)QueryPerformanceFrequency(&freq);
-  return double(freq.QuadPart);
+  return static_cast<double>(freq.QuadPart);
 #elif defined(__APPLE__)
   // https://developer.apple.com/library/mac/qa/qa1398/_index.html
   mach_timebase_info_data_t timebase;
   (void)mach_timebase_info(&timebase);
-  return double(timebase.denom) / timebase.numer * 1E9;
+  return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
 #else
   return 1E9;  // Haiku and clock_gettime return nanoseconds.
 #endif
diff -pruN 0.17.0-11/hwy/nanobenchmark_test.cc 1.0.0-2/hwy/nanobenchmark_test.cc
--- 0.17.0-11/hwy/nanobenchmark_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/nanobenchmark_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -58,7 +58,7 @@ std::mt19937 rng;
 // A function whose runtime depends on rng.
 FuncOutput Random(const void* /*arg*/, FuncInput in) {
   const size_t r = rng() & 0xF;
-  uint32_t ret = in;
+  FuncOutput ret = static_cast<FuncOutput>(in);
   for (size_t i = 0; i < r; ++i) {
     ret /= ((rng() & 1) + 2);
   }
diff -pruN 0.17.0-11/hwy/ops/arm_neon-inl.h 1.0.0-2/hwy/ops/arm_neon-inl.h
--- 0.17.0-11/hwy/ops/arm_neon-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/arm_neon-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -19,11 +19,16 @@
 // ARM NEON intrinsics are documented at:
 // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
 
-#include <arm_neon.h>
 #include <stddef.h>
 #include <stdint.h>
 
-#include "hwy/base.h"
+#include "hwy/base.h"  // before HWY_DIAGNOSTICS
+
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
+#include <arm_neon.h>
+HWY_DIAGNOSTICS(pop)
+
 #include "hwy/ops/shared-inl.h"
 
 HWY_BEFORE_NAMESPACE();
@@ -814,6 +819,9 @@ class Mask128 {
   Raw raw;
 };
 
+template <typename T>
+using Mask64 = Mask128<T, 8 / sizeof(T)>;
+
 namespace detail {
 
 // Deduce Simd<T, N, 0> from Vec128<T, N>
@@ -2625,61 +2633,49 @@ HWY_API Vec64<double> LoadU(Full64<doubl
   return Vec64<double>(vld1_f64(p));
 }
 #endif
-
 // ------------------------------ Load 32
 
-HWY_API Vec32<uint8_t> LoadU(Full32<uint8_t> /*tag*/,
-                             const uint8_t* HWY_RESTRICT p) {
-  uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
-  return Vec32<uint8_t>(vreinterpret_u8_u32(a));
-}
-HWY_API Vec32<uint16_t> LoadU(Full32<uint16_t> /*tag*/,
-                              const uint16_t* HWY_RESTRICT p) {
-  uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
-  return Vec32<uint16_t>(vreinterpret_u16_u32(a));
-}
+// Actual 32-bit broadcast load - used to implement the other lane types
+// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
 HWY_API Vec32<uint32_t> LoadU(Full32<uint32_t> /*tag*/,
                               const uint32_t* HWY_RESTRICT p) {
-  return Vec32<uint32_t>(vld1_dup_u32(reinterpret_cast<const uint32_t*>(p)));
-}
-HWY_API Vec32<int8_t> LoadU(Full32<int8_t> /*tag*/,
-                            const int8_t* HWY_RESTRICT p) {
-  int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
-  return Vec32<int8_t>(vreinterpret_s8_s32(a));
-}
-HWY_API Vec32<int16_t> LoadU(Full32<int16_t> /*tag*/,
-                             const int16_t* HWY_RESTRICT p) {
-  int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
-  return Vec32<int16_t>(vreinterpret_s16_s32(a));
+  return Vec32<uint32_t>(vld1_dup_u32(p));
 }
 HWY_API Vec32<int32_t> LoadU(Full32<int32_t> /*tag*/,
                              const int32_t* HWY_RESTRICT p) {
-  return Vec32<int32_t>(vld1_dup_s32(reinterpret_cast<const int32_t*>(p)));
+  return Vec32<int32_t>(vld1_dup_s32(p));
 }
 HWY_API Vec32<float> LoadU(Full32<float> /*tag*/, const float* HWY_RESTRICT p) {
   return Vec32<float>(vld1_dup_f32(p));
 }
 
+template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
+HWY_API Vec32<T> LoadU(Full32<T> d, const T* HWY_RESTRICT p) {
+  const Repartition<uint32_t, decltype(d)> d32;
+  uint32_t buf;
+  CopyBytes<4>(p, &buf);
+  return BitCast(d, LoadU(d32, &buf));
+}
+
 // ------------------------------ Load 16
 
-HWY_API Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2, 0> /*tag*/,
-                                 const uint8_t* HWY_RESTRICT p) {
-  uint16x4_t a = vld1_dup_u16(reinterpret_cast<const uint16_t*>(p));
-  return Vec128<uint8_t, 2>(vreinterpret_u8_u16(a));
-}
+// Actual 16-bit broadcast load - used to implement the other lane types
+// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
 HWY_API Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1, 0> /*tag*/,
                                   const uint16_t* HWY_RESTRICT p) {
-  return Vec128<uint16_t, 1>(
-      vld1_dup_u16(reinterpret_cast<const uint16_t*>(p)));
-}
-HWY_API Vec128<int8_t, 2> LoadU(Simd<int8_t, 2, 0> /*tag*/,
-                                const int8_t* HWY_RESTRICT p) {
-  int16x4_t a = vld1_dup_s16(reinterpret_cast<const int16_t*>(p));
-  return Vec128<int8_t, 2>(vreinterpret_s8_s16(a));
+  return Vec128<uint16_t, 1>(vld1_dup_u16(p));
 }
 HWY_API Vec128<int16_t, 1> LoadU(Simd<int16_t, 1, 0> /*tag*/,
                                  const int16_t* HWY_RESTRICT p) {
-  return Vec128<int16_t, 1>(vld1_dup_s16(reinterpret_cast<const int16_t*>(p)));
+  return Vec128<int16_t, 1>(vld1_dup_s16(p));
+}
+
+template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
+HWY_API Vec128<T, 2> LoadU(Simd<T, 2, 0> d, const T* HWY_RESTRICT p) {
+  const Repartition<uint16_t, decltype(d)> d16;
+  uint16_t buf;
+  CopyBytes<2>(p, &buf);
+  return BitCast(d, LoadU(d16, &buf));
 }
 
 // ------------------------------ Load 8
@@ -2821,30 +2817,10 @@ HWY_API void StoreU(const Vec64<double>
 
 // ------------------------------ Store 32
 
-HWY_API void StoreU(const Vec32<uint8_t> v, Full32<uint8_t>,
-                    uint8_t* HWY_RESTRICT p) {
-  uint32x2_t a = vreinterpret_u32_u8(v.raw);
-  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
-}
-HWY_API void StoreU(const Vec32<uint16_t> v, Full32<uint16_t>,
-                    uint16_t* HWY_RESTRICT p) {
-  uint32x2_t a = vreinterpret_u32_u16(v.raw);
-  vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
-}
 HWY_API void StoreU(const Vec32<uint32_t> v, Full32<uint32_t>,
                     uint32_t* HWY_RESTRICT p) {
   vst1_lane_u32(p, v.raw, 0);
 }
-HWY_API void StoreU(const Vec32<int8_t> v, Full32<int8_t>,
-                    int8_t* HWY_RESTRICT p) {
-  int32x2_t a = vreinterpret_s32_s8(v.raw);
-  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
-}
-HWY_API void StoreU(const Vec32<int16_t> v, Full32<int16_t>,
-                    int16_t* HWY_RESTRICT p) {
-  int32x2_t a = vreinterpret_s32_s16(v.raw);
-  vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
-}
 HWY_API void StoreU(const Vec32<int32_t> v, Full32<int32_t>,
                     int32_t* HWY_RESTRICT p) {
   vst1_lane_s32(p, v.raw, 0);
@@ -2854,27 +2830,31 @@ HWY_API void StoreU(const Vec32<float> v
   vst1_lane_f32(p, v.raw, 0);
 }
 
+template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
+HWY_API void StoreU(const Vec32<T> v, Full32<T> d, T* HWY_RESTRICT p) {
+  const Repartition<uint32_t, decltype(d)> d32;
+  const uint32_t buf = GetLane(BitCast(d32, v));
+  CopyBytes<4>(&buf, p);
+}
+
 // ------------------------------ Store 16
 
-HWY_API void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2, 0>,
-                    uint8_t* HWY_RESTRICT p) {
-  uint16x4_t a = vreinterpret_u16_u8(v.raw);
-  vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
-}
 HWY_API void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1, 0>,
                     uint16_t* HWY_RESTRICT p) {
   vst1_lane_u16(p, v.raw, 0);
 }
-HWY_API void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2, 0>,
-                    int8_t* HWY_RESTRICT p) {
-  int16x4_t a = vreinterpret_s16_s8(v.raw);
-  vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
-}
 HWY_API void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1, 0>,
                     int16_t* HWY_RESTRICT p) {
   vst1_lane_s16(p, v.raw, 0);
 }
 
+template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
+HWY_API void StoreU(const Vec128<T, 2> v, Simd<T, 2, 0> d, T* HWY_RESTRICT p) {
+  const Repartition<uint16_t, decltype(d)> d16;
+  const uint16_t buf = GetLane(BitCast(d16, v));
+  CopyBytes<2>(&buf, p);
+}
+
 // ------------------------------ Store 8
 
 HWY_API void StoreU(const Vec128<uint8_t, 1> v, Simd<uint8_t, 1, 0>,
@@ -4587,7 +4567,8 @@ HWY_API Vec128<bfloat16_t, 2 * N> Reorde
 
 // ================================================== CRYPTO
 
-#if defined(__ARM_FEATURE_AES)
+#if defined(__ARM_FEATURE_AES) || \
+    (HWY_HAVE_RUNTIME_DISPATCH && HWY_ARCH_ARM_A64)
 
 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
 #ifdef HWY_NATIVE_AES
@@ -4632,6 +4613,73 @@ HWY_API Vec128<float, N> PromoteTo(Simd<
   return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
 }
 
+// ------------------------------ Truncations
+
+template <typename From, typename To, HWY_IF_UNSIGNED(From),
+          HWY_IF_UNSIGNED(To),
+          hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
+HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
+                                 const Vec128<From, 1> v) {
+  const Repartition<To, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  return Vec128<To, 1>{v1.raw};
+}
+
+HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
+                                      const Vec128<uint64_t, 2> v) {
+  const Repartition<uint8_t, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  const auto v2 = detail::ConcatEven(v1, v1);
+  const auto v3 = detail::ConcatEven(v2, v2);
+  const auto v4 = detail::ConcatEven(v3, v3);
+  return LowerHalf(LowerHalf(LowerHalf(v4)));
+}
+
+HWY_API Vec32<uint16_t> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
+                                   const Vec128<uint64_t, 2> v) {
+  const Repartition<uint16_t, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  const auto v2 = detail::ConcatEven(v1, v1);
+  const auto v3 = detail::ConcatEven(v2, v2);
+  return LowerHalf(LowerHalf(v3));
+}
+
+HWY_API Vec64<uint32_t> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
+                                   const Vec128<uint64_t, 2> v) {
+  const Repartition<uint32_t, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  const auto v2 = detail::ConcatEven(v1, v1);
+  return LowerHalf(v2);
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint32_t, N> v) {
+  const Repartition<uint8_t, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  const auto v2 = detail::ConcatEven(v1, v1);
+  const auto v3 = detail::ConcatEven(v2, v2);
+  return LowerHalf(LowerHalf(v3));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+                                       const Vec128<uint32_t, N> v) {
+  const Repartition<uint16_t, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  const auto v2 = detail::ConcatEven(v1, v1);
+  return LowerHalf(v2);
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint16_t, N> v) {
+  const Repartition<uint8_t, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  const auto v2 = detail::ConcatEven(v1, v1);
+  return LowerHalf(v2);
+}
+
 // ------------------------------ MulEven (ConcatEven)
 
 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
@@ -5038,6 +5086,34 @@ HWY_API Mask128<T, N> LoadMaskBits(Simd<
 
 namespace detail {
 
+// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
+// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
+template <typename T>
+HWY_INLINE uint64_t NibblesFromMask(const Full128<T> d, Mask128<T> mask) {
+  const Full128<uint16_t> du16;
+  const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
+  const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
+  return GetLane(BitCast(Full64<uint64_t>(), nib));
+}
+
+template <typename T>
+HWY_INLINE uint64_t NibblesFromMask(const Full64<T> d, Mask64<T> mask) {
+  // There is no vshrn_n_u16 for uint16x4, so zero-extend.
+  const Twice<decltype(d)> d2;
+  const Vec128<T> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
+  // No need to mask, upper half is zero thanks to ZeroExtendVector.
+  return NibblesFromMask(d2, MaskFromVec(v128));
+}
+
+template <typename T, size_t N, HWY_IF_LE32(T, N)>
+HWY_INLINE uint64_t NibblesFromMask(Simd<T, N, 0> /*d*/, Mask128<T, N> mask) {
+  const Mask64<T> mask64(mask.raw);
+  const uint64_t nib = NibblesFromMask(Full64<T>(), mask64);
+  // Clear nibbles from upper half of 64-bits
+  constexpr size_t kBytes = sizeof(T) * N;
+  return nib & ((1ull << (kBytes * 4)) - 1);
+}
+
 template <typename T>
 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
                                  const Mask128<T> mask) {
@@ -5195,6 +5271,10 @@ HWY_INLINE uint64_t BitsFromMask(const M
 // Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
 // ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
 // changes each lane to 1 (if mask set) or 0.
+// NOTE: PopCount also operates on vectors, so we still have to do horizontal
+// sums separately. We specialize CountTrue for full vectors (negating instead
+// of PopCount because it avoids an extra shift), and use PopCount of
+// NibblesFromMask for partial vectors.
 
 template <typename T>
 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
@@ -5265,15 +5345,17 @@ HWY_API size_t CountTrue(Full128<T> /* t
 
 // Partial
 template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  return PopCount(detail::BitsFromMask(mask));
+HWY_API size_t CountTrue(Simd<T, N, 0> d, const Mask128<T, N> mask) {
+  constexpr int kDiv = 4 * sizeof(T);
+  return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
 }
-
 template <typename T, size_t N>
-HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
+HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
                                const Mask128<T, N> mask) {
-  const uint64_t bits = detail::BitsFromMask(mask);
-  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
+  const uint64_t nib = detail::NibblesFromMask(d, mask);
+  if (nib == 0) return -1;
+  constexpr int kDiv = 4 * sizeof(T);
+  return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
 }
 
 // `p` points to at least 8 writable bytes.
@@ -5286,29 +5368,21 @@ HWY_API size_t StoreMaskBits(Simd<T, N,
   return kNumBytes;
 }
 
+template <typename T, size_t N>
+HWY_API bool AllFalse(const Simd<T, N, 0> d, const Mask128<T, N> m) {
+  return detail::NibblesFromMask(d, m) == 0;
+}
+
 // Full
 template <typename T>
-HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
-#if HWY_ARCH_ARM_A64
-  const Full128<uint32_t> d32;
-  const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(d, m)));
-  return (vmaxvq_u32(m32.raw) == 0);
-#else
-  const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(d, m));
-  uint32x2_t a = vqmovn_u64(v64.raw);
-  return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
-#endif
+HWY_API bool AllTrue(const Full128<T> d, const Mask128<T> m) {
+  return detail::NibblesFromMask(d, m) == ~0ull;
 }
-
 // Partial
 template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
-  return detail::BitsFromMask(m) == 0;
-}
-
-template <typename T, size_t N>
 HWY_API bool AllTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
-  return AllFalse(d, VecFromMask(d, m) == Zero(d));
+  constexpr size_t kBytes = sizeof(T) * N;
+  return detail::NibblesFromMask(d, m) == (1ull << (kBytes * 4)) - 1;
 }
 
 // ------------------------------ Compress
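Note on the NibblesFromMask-based hunks above: vshrn_n_u16(v, 4) takes bits 4..11 of each 16-bit element, so every byte of the mask vector (0x00 or 0xFF) becomes one nibble (0x0 or 0xF) of the returned 64-bit value. A true lane of type T therefore contributes sizeof(T) nibbles, i.e. 4 * sizeof(T) set bits, which is the kDiv used by CountTrue and FindFirstTrue. A worked trace (illustrative, not from the patch):

    // T = uint32_t, full 128-bit vector, lanes = {true, false, true, true}:
    //   mask bytes        : FF FF FF FF  00 00 00 00  FF FF FF FF  FF FF FF FF
    //   NibblesFromMask   : 0xFFFFFFFF0000FFFFull  (one nibble per mask byte)
    //   CountTrue         : PopCount(...) / (4 * sizeof(uint32_t)) = 48 / 16 = 3
    //   FindFirstTrue     : Num0BitsBelowLS1Bit_Nonzero64(...) / 16 = 0
    //   AllFalse          : nibbles == 0 -> false;  AllTrue: nibbles == ~0ull -> false
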
@@ -5351,6 +5425,7 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy:
   // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
   // is likely more costly than the higher cache footprint from storing bytes.
   alignas(16) constexpr uint8_t table[256 * 8] = {
+      // PrintCompress16x8Tables
       0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
       2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
       4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
@@ -5486,12 +5561,165 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy:
 }
 
 template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/,
+                                       const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 256);
+  const Simd<T, N, 0> d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  const Simd<uint16_t, N, 0> du;
+
+  // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
+  // indices for VTBL (one vector's worth for each of 256 combinations of
+  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
+  // store lane indices and convert to byte indices (2*lane + 0..1), with the
+  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
+  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
+  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
+  // is likely more costly than the higher cache footprint from storing bytes.
+  alignas(16) constexpr uint8_t table[256 * 8] = {
+      // PrintCompressNot16x8Tables
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
+      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
+      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
+      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
+      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
+      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
+      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
+      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
+      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
+      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
+      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
+      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
+      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
+      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
+      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
+      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
+      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
+      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
+      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
+      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
+      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
+      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
+      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
+      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
+      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
+      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
+      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
+      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
+      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
+      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
+      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
+      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
+      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
+      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
+      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
+      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
+      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
+      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
+      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
+      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
+      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
+      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
+      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
+      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
+      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
+      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
+      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
+      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
+      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
+      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
+      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
+      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
+      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
+      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
+      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
+      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
+      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
+      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
+      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
+      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
+      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
+      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
+      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
+      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
+      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
+      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
+      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
+      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
+      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
+      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
+      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
+      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
+      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
+      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
+      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
+      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
+      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
+      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
+      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
+      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
+      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
+      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
+      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
+      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
+      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
+      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
+      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
+      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
+      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
+      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
+      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
+      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
+      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
+      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
+      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
+      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
+      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
+      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
+      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
+      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
+      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
+      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
+      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
+      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
+      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
+      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
+      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
+      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
+      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
+      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
+      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
+      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
+      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
+      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
+      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
+      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
+      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
+      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
+      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
+      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
+      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};
+
+  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
+  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+  return BitCast(d, pairs + Set(du, 0x0100));
+}
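
The 0x0100 step above can be sanity-checked in isolation: each table entry is the even byte index of a 16-bit lane to keep, ZipLower duplicates that byte into both halves of a 16-bit value, and adding 0x0100 turns the upper copy into the odd byte of the same lane. A standalone scalar sketch (the helper name is hypothetical, not part of Highway):

#include <cstdint>
#include <cstdio>

// Scalar model: given the even byte index b of a 16-bit lane, build the pair
// of byte-shuffle indices {b, b + 1}. Duplicating b into both bytes and then
// adding 0x0100 increments only the upper byte (no carry, since b <= 14).
static uint16_t PairIndexFromEvenByte(uint8_t b) {
  const uint16_t zipped =
      static_cast<uint16_t>(b | (static_cast<uint16_t>(b) << 8));
  return static_cast<uint16_t>(zipped + 0x0100);
}

int main() {
  for (int b = 0; b < 16; b += 2) {
    const uint16_t pair = PairIndexFromEvenByte(static_cast<uint8_t>(b));
    printf("%d -> {%d, %d}\n", b, pair & 0xFF, pair >> 8);  // e.g. 4 -> {4, 5}
  }
  return 0;
}
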
+
+template <typename T, size_t N>
 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
                                     const uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 16);
 
   // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t packed_array[16 * 16] = {
+  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+      // PrintCompress32x4Tables
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
       4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
@@ -5510,7 +5738,35 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy:
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
   const Simd<T, N, 0> d;
   const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
+                                       const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 16);
+
+  // There are only 4 lanes, so we can afford to load the index vector directly.
+  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+      // PrintCompressNot32x4Tables
+      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
+      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
+      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
+      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
+      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
+      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
+      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
+      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
+      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
+      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
+      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+      12, 13, 14, 15};
+  const Simd<T, N, 0> d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
 }
 
 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
@@ -5521,15 +5777,34 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy:
   HWY_DASSERT(mask_bits < 4);
 
   // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t packed_array[4 * 16] = {
+  alignas(16) constexpr uint8_t u8_indices[64] = {
+      // PrintCompress64x2Tables
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
+      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
+
+  const Simd<T, N, 0> d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
+                                       const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 4);
+
+  // There are only 2 lanes, so we can afford to load the index vector directly.
+  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
+      // PrintCompressNot64x2Tables
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
 
   const Simd<T, N, 0> d;
   const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
 }
 
 #endif
@@ -5545,13 +5820,76 @@ HWY_INLINE Vec128<T, N> Compress(Vec128<
   return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 }
 
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
+  const auto idx =
+      detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
+  using D = Simd<T, N, 0>;
+  const RebindToSigned<D> di;
+  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
+}
+
 }  // namespace detail
 
-template <typename T, size_t N>
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+  return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
+  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
+  const Simd<T, N, 0> d;
+  const Vec128<T, N> m = VecFromMask(d, mask);
+  const Vec128<T, N> maskL = DupEven(m);
+  const Vec128<T, N> maskH = DupOdd(m);
+  const Vec128<T, N> swap = AndNot(maskL, maskH);
+  return IfVecThenElse(swap, Shuffle01(v), v);
+}
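
A minimal scalar reference for the two-lane case (helper name hypothetical): with only two lanes, compression is either the identity or a swap, and the swap is needed exactly when mask[0] is false and mask[1] is true, matching swap = AndNot(maskL, maskH) above.

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of two-lane Compress: kept lanes move to the front; the
// remaining lane follows (a partition of the input). Only {m0=0, m1=1}
// changes the order.
static std::array<uint64_t, 2> Compress2(std::array<uint64_t, 2> v, bool m0,
                                         bool m1) {
  const bool swap = !m0 && m1;
  return swap ? std::array<uint64_t, 2>{v[1], v[0]} : v;
}

int main() {
  const std::array<uint64_t, 2> v{10, 20};
  assert((Compress2(v, false, false) == std::array<uint64_t, 2>{10, 20}));
  assert((Compress2(v, true, false) == std::array<uint64_t, 2>{10, 20}));
  assert((Compress2(v, false, true) == std::array<uint64_t, 2>{20, 10}));
  assert((Compress2(v, true, true) == std::array<uint64_t, 2>{10, 20}));
  return 0;
}
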
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
   return detail::Compress(v, detail::BitsFromMask(mask));
 }
 
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+  return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
+  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
+  const Full128<T> d;
+  const Vec128<T> m = VecFromMask(d, mask);
+  const Vec128<T> maskL = DupEven(m);
+  const Vec128<T> maskH = DupOdd(m);
+  const Vec128<T> swap = AndNot(maskH, maskL);
+  return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+  // For partial vectors, we cannot pull the Not() into the table because
+  // BitsFromMask clears the upper bits.
+  if (N < 16 / sizeof(T)) {
+    return detail::Compress(v, detail::BitsFromMask(Not(mask)));
+  }
+  return detail::CompressNot(v, detail::BitsFromMask(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+                                           Mask128<uint64_t> /* m */) {
+  return v;
+}
+
 // ------------------------------ CompressBits
 
 template <typename T, size_t N>
@@ -5943,15 +6281,6 @@ HWY_API void StoreInterleaved4(const Vec
 
 // ------------------------------ Lt128
 
-namespace detail {
-
-template <size_t kLanes, typename T, size_t N>
-Mask128<T, N> ShiftMaskLeft(Mask128<T, N> m) {
-  return MaskFromVec(ShiftLeftLanes<kLanes>(VecFromMask(Simd<T, N, 0>(), m)));
-}
-
-}  // namespace detail
-
 template <typename T, size_t N, HWY_IF_LE128(T, N)>
 HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
                                Vec128<T, N> b) {
@@ -5970,14 +6299,21 @@ HWY_INLINE Mask128<T, N> Lt128(Simd<T, N
   //  1  0  0  1  |  1
   //  1  1  0  0  |  0
   const Mask128<T, N> eqHL = Eq(a, b);
-  const Mask128<T, N> ltHL = Lt(a, b);
+  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
   // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
   // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
-  // comparison result leftwards requires only 4.
-  const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
-  const Mask128<T, N> outHx = Or(ltHL, And(eqHL, ltLx));
-  const Vec128<T, N> vecHx = VecFromMask(d, outHx);
-  return MaskFromVec(InterleaveUpper(d, vecHx, vecHx));
+  // comparison result leftwards requires only 4. IfThenElse compiles to the
+  // same code as OrAnd().
+  const Vec128<T, N> ltLx = DupEven(ltHL);
+  const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
+  return MaskFromVec(DupOdd(outHx));
+}
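
The per-block logic can be stated in scalar form (hypothetical helper): the result is the high-half comparison unless the high halves are equal, in which case the low-half comparison decides, i.e. ltH | (eqH & ltL).

#include <cassert>
#include <cstdint>

// Scalar model of the Lt128 block logic: IfThenElse(eqH, ltL, ltH), which is
// equivalent to ltH | (eqH & ltL) from the truth table above.
static bool Lt128Scalar(uint64_t aH, uint64_t aL, uint64_t bH, uint64_t bL) {
  const bool eqH = (aH == bH);
  const bool ltH = (aH < bH);
  const bool ltL = (aL < bL);
  return eqH ? ltL : ltH;
}

int main() {
  assert(Lt128Scalar(0, 5, 0, 7));      // high halves equal: low decides
  assert(!Lt128Scalar(0, 7, 0, 5));
  assert(Lt128Scalar(1, ~0ull, 2, 0));  // high halves differ: low ignored
  assert(!Lt128Scalar(2, 0, 1, ~0ull));
  return 0;
}
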
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+                                    Vec128<T, N> b) {
+  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
+  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
 }
 
 // ------------------------------ Min128, Max128 (Lt128)
@@ -5990,7 +6326,17 @@ HWY_INLINE VFromD<D> Min128(D d, const V
 
 template <class D>
 HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128(d, a, b), b, a);
+  return IfThenElse(Lt128(d, b, a), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+  return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+  return IfThenElse(Lt128Upper(d, b, a), a, b);
 }
 
 // ================================================== Operator wrapper
diff -pruN 0.17.0-11/hwy/ops/arm_sve-inl.h 1.0.0-2/hwy/ops/arm_sve-inl.h
--- 0.17.0-11/hwy/ops/arm_sve-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/arm_sve-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -24,11 +24,13 @@
 #include "hwy/ops/shared-inl.h"
 
 // If running on hardware whose vector length is known to be a power of two, we
-// can skip fixups for non-power of two sizes. This may be 1 on future
-// fixed-size SVE targets.
-#ifndef HWY_SVE_IS_POW2
+// can skip fixups for non-power of two sizes.
+#undef HWY_SVE_IS_POW2
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+#define HWY_SVE_IS_POW2 1
+#else
 #define HWY_SVE_IS_POW2 0
-#endif  // HWY_SVE_IS_POW2
+#endif
 
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
@@ -42,11 +44,6 @@ using DFromV = typename DFromV_t<RemoveC
 template <class V>
 using TFromV = TFromD<DFromV<V>>;
 
-#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
-#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
-#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
-#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
-
 // ================================================== MACROS
 
 // Generate specializations and function definitions using X macros. Although
@@ -202,31 +199,57 @@ HWY_INLINE size_t AllHardwareLanes(hwy::
   return svcntd_pat(SV_ALL);
 }
 
+// All-true mask from a macro
+#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)
+
+#if HWY_SVE_IS_POW2
+#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
+#else
+#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
+
 // Returns actual lanes of a hardware vector, rounded down to a power of two.
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<1> /* tag */) {
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_INLINE size_t HardwareLanes() {
   return svcntb_pat(SV_POW2);
 }
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<2> /* tag */) {
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE size_t HardwareLanes() {
   return svcnth_pat(SV_POW2);
 }
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<4> /* tag */) {
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE size_t HardwareLanes() {
   return svcntw_pat(SV_POW2);
 }
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<8> /* tag */) {
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE size_t HardwareLanes() {
   return svcntd_pat(SV_POW2);
 }
 
+#endif  // HWY_SVE_IS_POW2
+
 }  // namespace detail
 
 // Returns actual number of lanes after capping by N and shifting. May return 0
 // (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
+#if HWY_TARGET == HWY_SVE_256
+template <typename T, size_t N, int kPow2>
+HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
+  return HWY_MIN(detail::ScaleByPower(32 / sizeof(T), kPow2), N);
+}
+#elif HWY_TARGET == HWY_SVE2_128
+template <typename T, size_t N, int kPow2>
+HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
+  return HWY_MIN(detail::ScaleByPower(16 / sizeof(T), kPow2), N);
+}
+#else
 template <typename T, size_t N, int kPow2>
 HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
-  const size_t actual = detail::HardwareLanes(hwy::SizeTag<sizeof(T)>());
+  const size_t actual = detail::HardwareLanes<T>();
   // Common case of full vectors: avoid any extra instructions.
   if (detail::IsFull(d)) return actual;
   return HWY_MIN(detail::ScaleByPower(actual, kPow2), N);
 }
+#endif  // HWY_TARGET
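
For the fixed-size targets, Lanes() reduces to a compile-time expression. A standalone sketch of how it evaluates, with a hypothetical stand-in for detail::ScaleByPower (assumed to scale by 2^kPow2, shifting right for negative powers):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for detail::ScaleByPower: scale a lane count by
// 2^pow2, where negative pow2 selects fractional vectors.
constexpr size_t ScaleByPower(size_t n, int pow2) {
  return pow2 >= 0 ? (n << pow2) : (n >> (-pow2));
}

// Model of the SVE_256 Lanes(): a full 256-bit vector holds 32 / sizeof(T)
// lanes; kPow2 halves or doubles that, and N caps the result.
template <typename T, size_t N, int kPow2>
constexpr size_t Lanes256() {
  return ScaleByPower(32 / sizeof(T), kPow2) < N
             ? ScaleByPower(32 / sizeof(T), kPow2)
             : N;
}

int main() {
  printf("%zu\n", Lanes256<uint32_t, SIZE_MAX, 0>());   // 8 lanes of u32
  printf("%zu\n", Lanes256<uint32_t, SIZE_MAX, -1>());  // half vector: 4
  printf("%zu\n", Lanes256<uint64_t, 2, 0>());          // capped by N: 2
  return 0;
}
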
 
 // ================================================== MASK INIT
 
@@ -244,13 +267,14 @@ HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN,
 
 namespace detail {
 
-// All-true mask from a macro
-#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
-
-#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)       \
-  template <size_t N, int kPow2>                                   \
-  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
-    return HWY_SVE_PTRUE(BITS);                                    \
+#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)            \
+  template <size_t N, int kPow2>                                        \
+  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {      \
+    return HWY_SVE_PTRUE(BITS);                                         \
+  }                                                                     \
+  template <size_t N, int kPow2>                                        \
+  HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
+    return HWY_SVE_ALL_PTRUE(BITS);                                     \
   }
 
 HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
@@ -757,14 +781,14 @@ HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, If
 #undef HWY_SVE_IF_THEN_ELSE
 
 // ------------------------------ IfThenElseZero
-template <class M, class V>
-HWY_API V IfThenElseZero(const M mask, const V yes) {
+template <class V>
+HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
   return IfThenElse(mask, yes, Zero(DFromV<V>()));
 }
 
 // ------------------------------ IfThenZeroElse
-template <class M, class V>
-HWY_API V IfThenZeroElse(const M mask, const V no) {
+template <class V>
+HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
   return IfThenElse(mask, Zero(DFromV<V>()), no);
 }
 
@@ -827,26 +851,45 @@ HWY_API svbool_t MaskFromVec(const V v)
 }
 
 // ------------------------------ VecFromMask
-
-template <class D, HWY_IF_NOT_FLOAT_D(D)>
+template <class D>
 HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
-  const auto v0 = Zero(RebindToSigned<decltype(d)>());
-  return BitCast(d, detail::SubN(mask, v0, 1));
+  const RebindToSigned<D> di;
+  // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which
+  // requires an extra instruction plus M0 pipeline.
+  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
 }
 
-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
-  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
+// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
+
+#if HWY_TARGET == HWY_SVE2
+
+#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)          \
+  HWY_API HWY_SVE_V(BASE, BITS)                                   \
+      NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
+           HWY_SVE_V(BASE, BITS) no) {                            \
+    return sv##OP##_##CHAR##BITS(yes, no, mask);                  \
+  }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl)
+#undef HWY_SVE_IF_VEC
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(
+      d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no)));
 }
 
-// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
+#else
 
 template <class V>
 HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
-  // TODO(janwas): use svbsl for SVE2
-  return IfThenElse(MaskFromVec(mask), yes, no);
+  return Or(And(mask, yes), AndNot(mask, no));
 }
 
+#endif  // HWY_TARGET == HWY_SVE2
+
 // ------------------------------ Floating-point classification (Ne)
 
 template <class V>
@@ -1171,17 +1214,15 @@ HWY_API svint32_t PromoteTo(Simd<int32_t
 
 // ------------------------------ PromoteTo F
 
-// svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
-// first replicate each lane once.
 namespace detail {
 HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
-// Do not use zip2 to implement PromoteUpperTo or similar because vectors may be
-// non-powers of two, so getting the actual "upper half" requires MaskUpperHalf.
 }  // namespace detail
 
 template <size_t N, int kPow2>
 HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
                               const svfloat16_t v) {
+  // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
+  // first replicate each lane once.
   const svfloat16_t vv = detail::ZipLower(v, v);
   return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
 }
@@ -1281,6 +1322,60 @@ HWY_API svuint8_t U8FromU32(const svuint
   return svuzp1_u8(cast8, cast8);
 }
 
+// ------------------------------ Truncations
+
+template <size_t N, int kPow2>
+HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
+                             const svuint64_t v) {
+  const DFromV<svuint8_t> d;
+  const svuint8_t v1 = BitCast(d, v);
+  const svuint8_t v2 = svuzp1_u8(v1, v1);
+  const svuint8_t v3 = svuzp1_u8(v2, v2);
+  return svuzp1_u8(v3, v3);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
+                              const svuint64_t v) {
+  const DFromV<svuint16_t> d;
+  const svuint16_t v1 = BitCast(d, v);
+  const svuint16_t v2 = svuzp1_u16(v1, v1);
+  return svuzp1_u16(v2, v2);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint32_t TruncateTo(Simd<uint32_t, N, kPow2> /* tag */,
+                              const svuint64_t v) {
+  const DFromV<svuint32_t> d;
+  const svuint32_t v1 = BitCast(d, v);
+  return svuzp1_u32(v1, v1);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
+                             const svuint32_t v) {
+  const DFromV<svuint8_t> d;
+  const svuint8_t v1 = BitCast(d, v);
+  const svuint8_t v2 = svuzp1_u8(v1, v1);
+  return svuzp1_u8(v2, v2);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
+                              const svuint32_t v) {
+  const DFromV<svuint16_t> d;
+  const svuint16_t v1 = BitCast(d, v);
+  return svuzp1_u16(v1, v1);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
+                             const svuint16_t v) {
+  const DFromV<svuint8_t> d;
+  const svuint8_t v1 = BitCast(d, v);
+  return svuzp1_u8(v1, v1);
+}
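
The intended result of these TruncateTo overloads is simply the low bits of each wide lane, one narrow output lane per input lane; the repeated svuzp1 after a bitcast gathers those even-indexed narrow lanes. A scalar sketch of the u64 -> u8 case (helper name hypothetical):

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of TruncateTo(u8, u64): keep the low byte of every 64-bit
// lane, preserving lane order.
static std::vector<uint8_t> TruncateU64ToU8(const std::vector<uint64_t>& in) {
  std::vector<uint8_t> out;
  out.reserve(in.size());
  for (const uint64_t x : in) out.push_back(static_cast<uint8_t>(x & 0xFF));
  return out;
}

int main() {
  const std::vector<uint64_t> in{0x0102030405060708ull, 0x1112131415161718ull};
  for (const uint8_t b : TruncateU64ToU8(in)) {
    printf("%02X ", static_cast<unsigned>(b));  // prints: 08 18
  }
  printf("\n");
  return 0;
}
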
+
 // ------------------------------ DemoteTo I
 
 template <size_t N, int kPow2>
@@ -1331,6 +1426,10 @@ namespace detail {
   }
 HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
 HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
+#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
+#endif
 #undef HWY_SVE_CONCAT_EVERY_SECOND
 
 // Used to slide up / shift whole register left; mask indicates which range
@@ -1439,16 +1538,176 @@ HWY_API VFromD<D> Iota(const D d, TFromD
   return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
 }
 
+// ------------------------------ InterleaveLower
+
+template <class D, class V>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+#if HWY_TARGET == HWY_SVE2_128
+  (void)d;
+  return detail::ZipLower(a, b);
+#else
+  // Move lower halves of blocks to lower half of vector.
+  const Repartition<uint64_t, decltype(d)> d64;
+  const auto a64 = BitCast(d64, a);
+  const auto b64 = BitCast(d64, b);
+  const auto a_blocks = detail::ConcatEven(a64, a64);  // only lower half needed
+  const auto b_blocks = detail::ConcatEven(b64, b64);
+  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
+#endif
+}
+
+template <class V>
+HWY_API V InterleaveLower(const V a, const V b) {
+  return InterleaveLower(DFromV<V>(), a, b);
+}
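
For reference, the block-wise semantics being emulated here (helper name hypothetical): within each 128-bit block, the output alternates lanes taken from the lower half of that block of a and b. A scalar sketch for u32 lanes:

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of block-wise InterleaveLower: per 128-bit block, interleave
// the lower-half lanes of a and b. kLanesPerBlock = 16 / sizeof(uint32_t).
static std::vector<uint32_t> InterleaveLowerBlocks(
    const std::vector<uint32_t>& a, const std::vector<uint32_t>& b) {
  const size_t kLanesPerBlock = 4;
  std::vector<uint32_t> out(a.size());
  for (size_t block = 0; block < a.size(); block += kLanesPerBlock) {
    for (size_t i = 0; i < kLanesPerBlock / 2; ++i) {
      out[block + 2 * i + 0] = a[block + i];
      out[block + 2 * i + 1] = b[block + i];
    }
  }
  return out;
}

int main() {
  // Two 128-bit blocks of u32 (8 lanes each input).
  const std::vector<uint32_t> a{0, 1, 2, 3, 4, 5, 6, 7};
  const std::vector<uint32_t> b{10, 11, 12, 13, 14, 15, 16, 17};
  for (const uint32_t x : InterleaveLowerBlocks(a, b)) printf("%u ", x);
  printf("\n");  // 0 10 1 11 4 14 5 15
  return 0;
}
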
+
+// ------------------------------ InterleaveUpper
+
+// Only use zip2 if vectors are a power of two; otherwise getting the actual
+// "upper half" requires MaskUpperHalf.
+#if HWY_TARGET == HWY_SVE2_128
+namespace detail {
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpper, zip2)
+}  // namespace detail
+#endif
+
+// Full vector: guaranteed to have at least one block
+template <class D, class V = VFromD<D>,
+          hwy::EnableIf<detail::IsFull(D())>* = nullptr>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+#if HWY_TARGET == HWY_SVE2_128
+  (void)d;
+  return detail::ZipUpper(a, b);
+#else
+  // Move upper halves of blocks to lower half of vector.
+  const Repartition<uint64_t, decltype(d)> d64;
+  const auto a64 = BitCast(d64, a);
+  const auto b64 = BitCast(d64, b);
+  const auto a_blocks = detail::ConcatOdd(a64, a64);  // only lower half needed
+  const auto b_blocks = detail::ConcatOdd(b64, b64);
+  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
+#endif
+}
+
+// Capped/fraction: need runtime check
+template <class D, class V = VFromD<D>,
+          hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+  // Less than one block: treat as capped
+  if (Lanes(d) * sizeof(TFromD<D>) < 16) {
+    const Half<decltype(d)> d2;
+    return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
+  }
+  return InterleaveUpper(DFromV<V>(), a, b);
+}
+
 // ================================================== COMBINE
 
 namespace detail {
 
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
+svbool_t MaskLowerHalf(D d) {
+  switch (Lanes(d)) {
+    case 32:
+      return svptrue_pat_b8(SV_VL16);
+    case 16:
+      return svptrue_pat_b8(SV_VL8);
+    case 8:
+      return svptrue_pat_b8(SV_VL4);
+    case 4:
+      return svptrue_pat_b8(SV_VL2);
+    default:
+      return svptrue_pat_b8(SV_VL1);
+  }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
+svbool_t MaskLowerHalf(D d) {
+  switch (Lanes(d)) {
+    case 16:
+      return svptrue_pat_b16(SV_VL8);
+    case 8:
+      return svptrue_pat_b16(SV_VL4);
+    case 4:
+      return svptrue_pat_b16(SV_VL2);
+    default:
+      return svptrue_pat_b16(SV_VL1);
+  }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
+svbool_t MaskLowerHalf(D d) {
+  switch (Lanes(d)) {
+    case 8:
+      return svptrue_pat_b32(SV_VL4);
+    case 4:
+      return svptrue_pat_b32(SV_VL2);
+    default:
+      return svptrue_pat_b32(SV_VL1);
+  }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+svbool_t MaskLowerHalf(D d) {
+  switch (Lanes(d)) {
+    case 4:
+      return svptrue_pat_b64(SV_VL2);
+    default:
+      return svptrue_pat_b64(SV_VL1);
+  }
+}
+#endif
+#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
+template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
+svbool_t MaskLowerHalf(D d) {
+  switch (Lanes(d)) {
+    case 16:
+      return svptrue_pat_b8(SV_VL8);
+    case 8:
+      return svptrue_pat_b8(SV_VL4);
+    case 4:
+      return svptrue_pat_b8(SV_VL2);
+    case 2:
+    case 1:
+    default:
+      return svptrue_pat_b8(SV_VL1);
+  }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
+svbool_t MaskLowerHalf(D d) {
+  switch (Lanes(d)) {
+    case 8:
+      return svptrue_pat_b16(SV_VL4);
+    case 4:
+      return svptrue_pat_b16(SV_VL2);
+    case 2:
+    case 1:
+    default:
+      return svptrue_pat_b16(SV_VL1);
+  }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
+svbool_t MaskLowerHalf(D d) {
+  return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+svbool_t MaskLowerHalf(D /*d*/) {
+  return svptrue_pat_b64(SV_VL1);
+}
+#endif  // HWY_TARGET == HWY_SVE2_128
+#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
 template <class D>
 svbool_t MaskLowerHalf(D d) {
   return FirstN(d, Lanes(d) / 2);
 }
+#endif
+
 template <class D>
 svbool_t MaskUpperHalf(D d) {
+  // TODO(janwas): WHILEGE on pow2 SVE2
+  if (HWY_SVE_IS_POW2 && IsFull(d)) {
+    return Not(MaskLowerHalf(d));
+  }
+
   // For Splice to work as intended, make sure bits above Lanes(d) are zero.
   return AndNot(MaskLowerHalf(d), detail::MakeMask(d));
 }
@@ -1475,18 +1734,43 @@ HWY_API V ConcatUpperLower(const D d, co
 // ------------------------------ ConcatLowerLower
 template <class D, class V>
 HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
+  if (detail::IsFull(d)) {
+#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
+    return detail::ConcatEvenBlocks(hi, lo);
+#endif
+#if HWY_TARGET == HWY_SVE2_128
+    const Repartition<uint64_t, D> du64;
+    const auto lo64 = BitCast(du64, lo);
+    return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi)));
+#endif
+  }
   return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
 }
 
 // ------------------------------ ConcatLowerUpper
 template <class D, class V>
 HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128  // constexpr Lanes
+  if (detail::IsFull(d)) {
+    return detail::Ext<Lanes(d) / 2>(hi, lo);
+  }
+#endif
   return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
 }
 
 // ------------------------------ ConcatUpperUpper
 template <class D, class V>
 HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
+  if (detail::IsFull(d)) {
+#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
+    return detail::ConcatOddBlocks(hi, lo);
+#endif
+#if HWY_TARGET == HWY_SVE2_128
+    const Repartition<uint64_t, D> du64;
+    const auto lo64 = BitCast(du64, lo);
+    return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi)));
+#endif
+  }
   const svbool_t mask_upper = detail::MaskUpperHalf(d);
   const V lo_upper = detail::Splice(lo, lo, mask_upper);
   return IfThenElse(mask_upper, hi, lo_upper);
@@ -1517,10 +1801,62 @@ HWY_API V LowerHalf(const V v) {
 }
 
 template <class D2, class V>
-HWY_API V UpperHalf(const D2 /* d2 */, const V v) {
-  return detail::Splice(v, v, detail::MaskUpperHalf(Twice<D2>()));
+HWY_API V UpperHalf(const D2 d2, const V v) {
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128  // constexpr Lanes
+  return detail::Ext<Lanes(d2)>(v, v);
+#else
+  return detail::Splice(v, v, detail::MaskUpperHalf(Twice<decltype(d2)>()));
+#endif
+}
+
+// ================================================== REDUCE
+
+// These return T, whereas the Highway op returns a broadcasted vector.
+namespace detail {
+#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP)                   \
+  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) {   \
+    /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
+    using T = HWY_SVE_T(BASE, BITS);                                           \
+    using TU = MakeUnsigned<T>;                                                \
+    constexpr uint64_t kMask = LimitsMax<TU>();                                \
+    return static_cast<T>(static_cast<TU>(                                     \
+        static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask));         \
+  }
+
+#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)                     \
+  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_##CHAR##BITS(pg, v);                                     \
+  }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanes, addv)
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanes, addv)
+
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
+// NaN if all are
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
+
+#undef HWY_SVE_REDUCE
+#undef HWY_SVE_REDUCE_ADD
+}  // namespace detail
+
+template <class D, class V>
+V SumOfLanes(D d, V v) {
+  return Set(d, detail::SumOfLanes(detail::MakeMask(d), v));
+}
+
+template <class D, class V>
+V MinOfLanes(D d, V v) {
+  return Set(d, detail::MinOfLanes(detail::MakeMask(d), v));
+}
+
+template <class D, class V>
+V MaxOfLanes(D d, V v) {
+  return Set(d, detail::MaxOfLanes(detail::MakeMask(d), v));
 }
 
+
 // ================================================== SWIZZLE
 
 // ------------------------------ GetLane
@@ -1579,26 +1915,53 @@ HWY_API V DupOdd(const V v) {
 
 // ------------------------------ OddEven
 
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVN, Insert, insr_n)
-}  // namespace detail
+#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE2
+
+#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)          \
+  HWY_API HWY_SVE_V(BASE, BITS)                                     \
+      NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
+    return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0);             \
+  }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n)
+#undef HWY_SVE_ODD_EVEN
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V OddEven(const V odd, const V even) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du;
+  return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even)));
+}
+
+#else
 
 template <class V>
 HWY_API V OddEven(const V odd, const V even) {
-  const auto even_in_odd = detail::Insert(even, 0);
-  return detail::InterleaveOdd(even_in_odd, odd);
+  const auto odd_in_even = detail::Ext<1>(odd, odd);
+  return detail::InterleaveEven(even, odd_in_even);
 }
 
+#endif  // HWY_TARGET
+
 // ------------------------------ OddEvenBlocks
 template <class V>
 HWY_API V OddEvenBlocks(const V odd, const V even) {
-  const RebindToUnsigned<DFromV<V>> du;
+  const DFromV<V> d;
+#if HWY_TARGET == HWY_SVE_256
+  return ConcatUpperLower(d, odd, even);
+#elif HWY_TARGET == HWY_SVE2_128
+  (void)odd;
+  (void)d;
+  return even;
+#else
+  const RebindToUnsigned<decltype(d)> du;
   using TU = TFromD<decltype(du)>;
   constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
   const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
   const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
   const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));
   return IfThenElse(is_even, even, odd);
+#endif
 }
 
 // ------------------------------ TableLookupLanes
@@ -1648,11 +2011,18 @@ constexpr size_t LanesPerBlock(Simd<T, N
 template <class V>
 HWY_API V SwapAdjacentBlocks(const V v) {
   const DFromV<V> d;
+#if HWY_TARGET == HWY_SVE_256
+  return ConcatLowerUpper(d, v, v);
+#elif HWY_TARGET == HWY_SVE2_128
+  (void)d;
+  return v;
+#else
   const RebindToUnsigned<decltype(d)> du;
   constexpr auto kLanesPerBlock =
       static_cast<TFromV<V>>(detail::LanesPerBlock(d));
   const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);
   return TableLookupLanes(v, idx);
+#endif
 }
 
 // ------------------------------ Reverse
@@ -1675,11 +2045,14 @@ HWY_API V Reverse(D d, V v) {
   const auto reversed = detail::ReverseFull(v);
   if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed;
   // Shift right to remove extra (non-pow2 and remainder) lanes.
-  // TODO(janwas): on SVE2, use whilege.
-  const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag<sizeof(T)>());
-  // Avoids FirstN truncating to the return vector size.
+  // TODO(janwas): on SVE2, use WHILEGE.
+  // Avoids FirstN truncating to the return vector size. Must also avoid Not
+  // because that is limited to SV_POW2.
   const ScalableTag<T> dfull;
-  const svbool_t mask = Not(FirstN(dfull, all_lanes - Lanes(d)));
+  const svbool_t all_true = detail::AllPTrue(dfull);
+  const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag<sizeof(T)>());
+  const svbool_t mask =
+      svnot_b_z(all_true, FirstN(dfull, all_lanes - Lanes(d)));
   return detail::Splice(reversed, reversed, mask);
 }
 
@@ -1700,14 +2073,23 @@ HWY_API VFromD<D> Reverse2(D d, const VF
 }
 
 template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) {  // 3210
-  const auto even_in_odd = detail::Insert(v, 0);              // 210z
-  return detail::InterleaveOdd(v, even_in_odd);               // 2301
+HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {  // 3210
+#if HWY_TARGET == HWY_SVE2_128
+  if (detail::IsFull(d)) {
+    return detail::Ext<1>(v, v);
+  }
+#endif
+  (void)d;
+  const auto odd_in_even = detail::Ext<1>(v, v);  // x321
+  return detail::InterleaveEven(odd_in_even, v);  // 2301
 }
-
 // ------------------------------ Reverse4 (TableLookupLanes)
 template <class D>
 HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
+  if (HWY_TARGET == HWY_SVE_256 && sizeof(TFromD<D>) == 8 &&
+      detail::IsFull(d)) {
+    return detail::ReverseFull(v);
+  }
   // TODO(janwas): is this approach faster than Shuffle0123?
   const RebindToUnsigned<decltype(d)> du;
   const auto idx = detail::XorN(Iota(du, 0), 3);
@@ -1726,7 +2108,13 @@ HWY_API VFromD<D> Reverse8(D d, const VF
 
 template <typename T>
 struct CompressIsPartition {
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+  // Optimization for 64-bit lanes (could also be applied to 32-bit, but that
+  // requires a larger table).
+  enum { value = (sizeof(T) == 8) };
+#else
   enum { value = 0 };
+#endif  // HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
 };
 
 #define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)                     \
@@ -1734,9 +2122,48 @@ struct CompressIsPartition {
     return sv##OP##_##CHAR##BITS(mask, v);                                     \
   }
 
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact)
+HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact)
+#else
 HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact)
+#endif
 #undef HWY_SVE_COMPRESS
 
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
+HWY_API V Compress(V v, svbool_t mask) {
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du64;
+
+  // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
+  // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
+  // SetTableIndices.
+  const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
+  const size_t offset = detail::SumOfLanes(mask, bits);
+
+  // See CompressIsPartition.
+  alignas(16) static constexpr uint64_t table[4 * 16] = {
+      // PrintCompress64x4Tables
+      0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
+      1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
+      0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
+  return TableLookupLanes(v, SetTableIndices(d, table + offset));
+}
+#endif  // HWY_TARGET == HWY_SVE_256
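
The offset computation can be checked in scalar form: each active lane i contributes 1 << (i + 2), so the horizontal sum is 4 * mask_bits, and since every table row holds 4 uint64_t entries this is directly the element offset of row mask_bits. A sketch (helper name hypothetical):

#include <cassert>
#include <cstddef>

// Scalar model of the offset: summing (1 << (i + 2)) over active lanes
// equals 4 * mask_bits, matching Shl(Set(du64, 1), Iota(du64, 2)) above.
static size_t OffsetFromMask(const bool mask[4]) {
  size_t offset = 0;
  for (size_t i = 0; i < 4; ++i) {
    if (mask[i]) offset += size_t{1} << (i + 2);
  }
  return offset;
}

int main() {
  const bool m0[4] = {false, false, false, false};
  const bool m5[4] = {true, false, true, false};  // mask_bits = 0b0101 = 5
  assert(OffsetFromMask(m0) == 0);
  assert(OffsetFromMask(m5) == 5 * 4);  // element offset of table row 5
  return 0;
}
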
+#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
+template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
+HWY_API V Compress(V v, svbool_t mask) {
+  // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
+  // swaps upper/lower (the lower half is set to the upper half, and the
+  // remaining upper half is filled from the lower half of the second v), and
+  // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10
+  // unchanged and map everything else to 00.
+  const svbool_t maskLL = svzip1_b64(mask, mask);  // broadcast lower lane
+  return detail::Splice(v, v, AndNot(maskLL, mask));
+}
+#endif  // HWY_TARGET == HWY_SVE2_128
+
 template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
 HWY_API V Compress(V v, svbool_t mask16) {
   static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
@@ -1773,17 +2200,78 @@ HWY_API svfloat16_t Compress(svfloat16_t
   return BitCast(df, Compress(BitCast(di, v), mask16));
 }
 
+// ------------------------------ CompressNot
+
+template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
+HWY_API V CompressNot(V v, const svbool_t mask) {
+  return Compress(v, Not(mask));
+}
+
+template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
+HWY_API V CompressNot(V v, svbool_t mask) {
+#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
+  // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
+  // swaps upper/lower (the lower half is set to the upper half, and the
+  // remaining upper half is filled from the lower half of the second v), and
+  // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map
+  // 01 to 10, and everything else to 00.
+  const svbool_t maskLL = svzip1_b64(mask, mask);  // broadcast lower lane
+  return detail::Splice(v, v, AndNot(mask, maskLL));
+#endif
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+  const DFromV<V> d;
+  const RebindToUnsigned<decltype(d)> du64;
+
+  // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
+  // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
+  // SetTableIndices.
+  const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
+  const size_t offset = detail::SumOfLanes(mask, bits);
+
+  // See CompressIsPartition.
+  alignas(16) static constexpr uint64_t table[4 * 16] = {
+      // PrintCompressNot64x4Tables
+      0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
+      0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
+      2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
+  return TableLookupLanes(v, SetTableIndices(d, table + offset));
+#endif  // HWY_TARGET == HWY_SVE_256
+
+  return Compress(v, Not(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
+#if HWY_TARGET == HWY_SVE2_128
+  (void)mask;
+  return v;
+#endif
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+  uint64_t bits = 0;  // predicate reg is 32-bit
+  CopyBytes<4>(&mask, &bits);
+  // Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx.
+  const size_t offset = ((bits & 1) ? 4 : 0) + ((bits & 0x10000) ? 8 : 0);
+  // See CompressIsPartition. Manually generated; flip halves if mask = [0, 1].
+  alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
+                                                        0, 1, 2, 3, 0, 1, 2, 3};
+  const ScalableTag<uint64_t> d;
+  return TableLookupLanes(v, SetTableIndices(d, table + offset));
+#endif
+
+  return CompressNot(v, mask);
+}
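
The bit tests above rely on the SVE predicate layout of one bit per vector byte, so for 64-bit lanes the flag for lane i sits at bit 8 * i: lane 0 at bit 0 and lane 2 (the first lane of the upper block) at bit 16. A scalar sketch of the resulting table offsets (helper name hypothetical):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Scalar model of the offset selection: offsets 0, 8 and 12 keep the identity
// row {0, 1, 2, 3}; offset 4 selects {2, 3, 0, 1}, which moves the upper
// block to the front.
static size_t BlockOffsetFromPredicateBits(uint32_t bits) {
  return ((bits & 1) ? 4 : 0) + ((bits & 0x10000) ? 8 : 0);
}

int main() {
  assert(BlockOffsetFromPredicateBits(0) == 0);            // identity
  assert(BlockOffsetFromPredicateBits(0x00000001) == 4);   // flip the blocks
  assert(BlockOffsetFromPredicateBits(0x00010001) == 12);  // identity
  return 0;
}
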
+
 // ------------------------------ CompressStore
-template <class V, class M, class D>
-HWY_API size_t CompressStore(const V v, const M mask, const D d,
+template <class V, class D>
+HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d,
                              TFromD<D>* HWY_RESTRICT unaligned) {
   StoreU(Compress(v, mask), d, unaligned);
   return CountTrue(d, mask);
 }
 
 // ------------------------------ CompressBlendedStore
-template <class V, class M, class D>
-HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
+template <class V, class D>
+HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d,
                                     TFromD<D>* HWY_RESTRICT unaligned) {
   const size_t count = CountTrue(d, mask);
   const svbool_t store_mask = FirstN(d, count);
@@ -1795,6 +2283,9 @@ HWY_API size_t CompressBlendedStore(cons
 
 // ------------------------------ CombineShiftRightBytes
 
+// Prevent accidentally using these for 128-bit vectors - should not be
+// necessary.
+#if HWY_TARGET != HWY_SVE2_128
 namespace detail {
 
 // For x86-compatible behaviour mandated by Highway API: TableLookupBytes
@@ -1847,16 +2338,21 @@ svbool_t FirstNPerBlock(D d) {
 }
 
 }  // namespace detail
+#endif  // HWY_TARGET != HWY_SVE2_128
 
 template <size_t kBytes, class D, class V = VFromD<D>>
 HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
   const Repartition<uint8_t, decltype(d)> d8;
   const auto hi8 = BitCast(d8, hi);
   const auto lo8 = BitCast(d8, lo);
+#if HWY_TARGET == HWY_SVE2_128
+  return BitCast(d, detail::Ext<kBytes>(hi8, lo8));
+#else
   const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
   const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
   const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
   return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
+#endif
 }
 
 // ------------------------------ Shuffle2301
@@ -1916,6 +2412,16 @@ HWY_API V Shuffle0123(const V v) {
 // ------------------------------ ReverseBlocks (Reverse, Shuffle01)
 template <class D, class V = VFromD<D>>
 HWY_API V ReverseBlocks(D d, V v) {
+#if HWY_TARGET == HWY_SVE_256
+  if (detail::IsFull(d)) {
+    return SwapAdjacentBlocks(v);
+  } else if (detail::IsFull(Twice<D>())) {
+    return v;
+  }
+#elif HWY_TARGET == HWY_SVE2_128
+  (void)d;
+  return v;
+#endif
   const Repartition<uint64_t, D> du64;
   return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
 }
@@ -1926,9 +2432,13 @@ template <class V, class VI>
 HWY_API VI TableLookupBytes(const V v, const VI idx) {
   const DFromV<VI> d;
   const Repartition<uint8_t, decltype(d)> du8;
+#if HWY_TARGET == HWY_SVE2_128
+  return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx)));
+#else
   const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
   const auto idx8 = Add(BitCast(du8, idx), offsets128);
   return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
+#endif
 }
 
 template <class V, class VI>
@@ -1945,17 +2455,35 @@ HWY_API VI TableLookupBytesOr0(const V v
 }
 
 // ------------------------------ Broadcast
+
+#if HWY_TARGET == HWY_SVE2_128
+namespace detail {
+#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)        \
+  template <int kLane>                                             \
+  HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
+    return sv##OP##_##CHAR##BITS(v, kLane);                        \
+  }
+
+HWY_SVE_FOREACH(HWY_SVE_BROADCAST, Broadcast, dup_lane)
+#undef HWY_SVE_BROADCAST
+}  // namespace detail
+#endif
+
 template <int kLane, class V>
 HWY_API V Broadcast(const V v) {
   const DFromV<V> d;
   const RebindToUnsigned<decltype(d)> du;
   constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
   static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
+#if HWY_TARGET == HWY_SVE2_128
+  return detail::Broadcast<kLane>(v);
+#else
   auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
   if (kLane != 0) {
     idx = detail::AddN(idx, kLane);
   }
   return TableLookupLanes(v, idx);
+#endif
 }
 
 // ------------------------------ ShiftLeftLanes
@@ -1964,8 +2492,12 @@ template <size_t kLanes, class D, class
 HWY_API V ShiftLeftLanes(D d, const V v) {
   const auto zero = Zero(d);
   const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
+#if HWY_TARGET == HWY_SVE2_128
+  return shifted;
+#else
   // Match x86 semantics by zeroing lower lanes in 128-bit blocks
   return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
+#endif
 }
 
 template <size_t kLanes, class V>
@@ -1981,11 +2513,15 @@ HWY_API V ShiftRightLanes(D d, V v) {
     v = IfThenElseZero(detail::MakeMask(d), v);
   }
 
+#if HWY_TARGET == HWY_SVE2_128
+  return detail::Ext<kLanes>(Zero(d), v);
+#else
   const auto shifted = detail::Ext<kLanes>(v, v);
   // Match x86 semantics by zeroing upper lanes in 128-bit blocks
   constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
   const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
   return IfThenElseZero(mask, shifted);
+#endif
 }
 
 // ------------------------------ ShiftLeftBytes
@@ -2008,53 +2544,6 @@ HWY_API V ShiftRightBytes(const D d, con
   return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
 }
 
-// ------------------------------ InterleaveLower
-
-template <class D, class V>
-HWY_API V InterleaveLower(D d, const V a, const V b) {
-  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-  // Move lower halves of blocks to lower half of vector.
-  const Repartition<uint64_t, decltype(d)> d64;
-  const auto a64 = BitCast(d64, a);
-  const auto b64 = BitCast(d64, b);
-  const auto a_blocks = detail::ConcatEven(a64, a64);  // only lower half needed
-  const auto b_blocks = detail::ConcatEven(b64, b64);
-
-  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
-}
-
-template <class V>
-HWY_API V InterleaveLower(const V a, const V b) {
-  return InterleaveLower(DFromV<V>(), a, b);
-}
-
-// ------------------------------ InterleaveUpper
-
-// Full vector: guaranteed to have at least one block
-template <class D, class V = VFromD<D>,
-          hwy::EnableIf<detail::IsFull(D())>* = nullptr>
-HWY_API V InterleaveUpper(D d, const V a, const V b) {
-  // Move upper halves of blocks to lower half of vector.
-  const Repartition<uint64_t, decltype(d)> d64;
-  const auto a64 = BitCast(d64, a);
-  const auto b64 = BitCast(d64, b);
-  const auto a_blocks = detail::ConcatOdd(a64, a64);  // only lower half needed
-  const auto b_blocks = detail::ConcatOdd(b64, b64);
-  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
-}
-
-// Capped/fraction: need runtime check
-template <class D, class V = VFromD<D>,
-          hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
-HWY_API V InterleaveUpper(D d, const V a, const V b) {
-  // Less than one block: treat as capped
-  if (Lanes(d) * sizeof(TFromD<D>) < 16) {
-    const Half<decltype(d)> d2;
-    return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
-  }
-  return InterleaveUpper(DFromV<V>(), a, b);
-}
-
 // ------------------------------ ZipLower
 
 template <class V, class DW = RepartitionToWide<DFromV<V>>>
@@ -2076,25 +2565,6 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a,
   return BitCast(dw, InterleaveUpper(dn, a, b));
 }
 
-// ================================================== REDUCE
-
-#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)                 \
-  template <size_t N, int kPow2>                                         \
-  HWY_API HWY_SVE_V(BASE, BITS)                                          \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, HWY_SVE_V(BASE, BITS) v) { \
-    return Set(d, static_cast<HWY_SVE_T(BASE, BITS)>(                    \
-                      sv##OP##_##CHAR##BITS(detail::MakeMask(d), v)));   \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_REDUCE, SumOfLanes, addv)
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
-// NaN if all are
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
-
-#undef HWY_SVE_REDUCE
-
 // ================================================== Ops with dependencies
 
 // ------------------------------ PromoteTo bfloat16 (ZipLower)
@@ -2261,6 +2731,7 @@ HWY_INLINE svuint64_t BitsFromBool(svuin
 }  // namespace detail
 
 // `p` points to at least 8 writable bytes.
+// TODO(janwas): specialize for HWY_SVE_256
 template <class D>
 HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
   svuint64_t bits_in_u64 =
@@ -2275,7 +2746,7 @@ HWY_API size_t StoreMaskBits(D d, svbool
   // Non-full byte, need to clear the undefined upper bits. Can happen for
   // capped/fractional vectors or large T and small hardware vectors.
   if (num_bits < 8) {
-    const int mask = (1 << num_bits) - 1;
+    const int mask = (1ull << num_bits) - 1;
     bits[0] = static_cast<uint8_t>(bits[0] & mask);
   }
   // Else: we wrote full bytes because num_bits is a power of two >= 8.
@@ -2354,7 +2825,9 @@ HWY_API svfloat32_t ReorderWidenMulAccum
 
 // ------------------------------ AESRound / CLMul
 
-#if defined(__ARM_FEATURE_SVE2_AES)
+#if defined(__ARM_FEATURE_SVE2_AES) ||                         \
+    ((HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128) && \
+     HWY_HAVE_RUNTIME_DISPATCH)
 
 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
 #ifdef HWY_NATIVE_AES
@@ -2384,48 +2857,93 @@ HWY_API svuint64_t CLMulUpper(const svui
 #endif  // __ARM_FEATURE_SVE2_AES
 
 // ------------------------------ Lt128
+
+namespace detail {
+#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)                        \
+  template <size_t N, int kPow2>                                             \
+  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \
+    return sv##OP##_b##BITS(m, m);                                           \
+  }
+
+HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1)  // actually for bool
+HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2)   // actually for bool
+#undef HWY_SVE_DUP
+
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+template <class D>
+HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
+  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
+  const svbool_t eqHx = Eq(a, b);  // only odd lanes used
+  // Convert to vector: more pipelines can TRN* for vectors than predicates.
+  const svuint64_t ltHL = VecFromMask(d, Lt(a, b));
+  // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
+  // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated.
+  const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL);
+  // Duplicate upper lane into lower.
+  return DupOdd(ltHx);
+}
+#endif
+}  // namespace detail
+
+template <class D>
+HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
+#if HWY_TARGET == HWY_SVE_256
+  return MaskFromVec(detail::Lt128Vec(d, a, b));
+#else
+  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
+  const svbool_t eqHx = Eq(a, b);  // only odd lanes used
+  const svbool_t ltHL = Lt(a, b);
+  // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
+  const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL);
+  // Duplicate upper lane into lower.
+  return detail::DupOddB(d, ltHx);
+#endif  // HWY_TARGET != HWY_SVE_256
+}
+
+// ------------------------------ Lt128Upper
+
 template <class D>
-HWY_INLINE svbool_t Lt128(D /* d */, const svuint64_t a, const svuint64_t b) {
+HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
   static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  // Truth table of Eq and Compare for Hi and Lo u64.
-  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
-  // =H =L cH cL  | out = cH | (=H & cL) = IfThenElse(=H, cL, cH)
-  //  0  0  0  0  |  0
-  //  0  0  0  1  |  0
-  //  0  0  1  0  |  1
-  //  0  0  1  1  |  1
-  //  0  1  0  0  |  0
-  //  0  1  0  1  |  0
-  //  0  1  1  0  |  1
-  //  1  0  0  0  |  0
-  //  1  0  0  1  |  1
-  //  1  1  0  0  |  0
-  const svbool_t eqHL = Eq(a, b);
   const svbool_t ltHL = Lt(a, b);
-  // trn (interleave even/odd) allow us to move and copy masks across lanes.
-  const svbool_t cmpLL = svtrn1_b64(ltHL, ltHL);
-  const svbool_t outHx = svsel_b(eqHL, cmpLL, ltHL);  // See truth table above.
-  return svtrn2_b64(outHx, outHx);                    // replicate to HH
+  return detail::DupOddB(d, ltHL);
 }
 
 // ------------------------------ Min128, Max128 (Lt128)
 
 template <class D>
 HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) {
+#if HWY_TARGET == HWY_SVE_256
+  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
+#else
   return IfThenElse(Lt128(d, a, b), a, b);
+#endif
 }
 
 template <class D>
 HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) {
-  return IfThenElse(Lt128(d, a, b), b, a);
+#if HWY_TARGET == HWY_SVE_256
+  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
+#else
+  return IfThenElse(Lt128(d, b, a), a, b);
+#endif
+}
+
+template <class D>
+HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) {
+  return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) {
+  return IfThenElse(Lt128Upper(d, b, a), a, b);
 }
 
 // ================================================== END MACROS
 namespace detail {  // for code folding
 #undef HWY_IF_FLOAT_V
 #undef HWY_IF_LANE_SIZE_V
-#undef HWY_IF_SIGNED_V
-#undef HWY_IF_UNSIGNED_V
+#undef HWY_SVE_ALL_PTRUE
 #undef HWY_SVE_D
 #undef HWY_SVE_FOREACH
 #undef HWY_SVE_FOREACH_F
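
// A minimal scalar sketch of the semantics implemented by the Lt128 and
// Lt128Upper overloads added in this file: each 128-bit block is a pair of
// u64 lanes (low half in the even lane, high half in the odd lane). Lt128
// compares the full 128-bit values; Lt128Upper compares only the upper u64.
// Function names here are illustrative, not part of the Highway API.
#include <cstdint>
#include <cstdio>

// True iff the 128-bit value (aHi:aLo) is less than (bHi:bLo). This is the
// "cH | (=H & cL)" formula from the truth table removed above.
static bool RefLt128(uint64_t aLo, uint64_t aHi, uint64_t bLo, uint64_t bHi) {
  return (aHi < bHi) || (aHi == bHi && aLo < bLo);
}

// Lt128Upper ignores the low halves entirely.
static bool RefLt128Upper(uint64_t aHi, uint64_t bHi) { return aHi < bHi; }

int main() {
  printf("%d\n", RefLt128(/*aLo=*/5, /*aHi=*/1, /*bLo=*/0, /*bHi=*/2));  // 1
  printf("%d\n", RefLt128Upper(/*aHi=*/2, /*bHi=*/2));                   // 0
  return 0;
}
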
diff -pruN 0.17.0-11/hwy/ops/emu128-inl.h 1.0.0-2/hwy/ops/emu128-inl.h
--- 0.17.0-11/hwy/ops/emu128-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/emu128-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -135,13 +135,13 @@ HWY_API Vec128<T, N> Undefined(Simd<T, N
 
 namespace detail {
 
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_INLINE constexpr T IncrementWithWraparound(T t) {
+template <typename T>
+HWY_INLINE constexpr T IncrementWithWraparound(hwy::FloatTag /*tag*/, T t) {
   return t + T{1};
 }
 
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_INLINE constexpr T IncrementWithWraparound(T t) {
+template <typename T>
+HWY_INLINE constexpr T IncrementWithWraparound(hwy::NonFloatTag /*tag*/, T t) {
   using TU = MakeUnsigned<T>;
   return static_cast<T>(static_cast<TU>(static_cast<TU>(t) + TU{1}) &
                         hwy::LimitsMax<TU>());
@@ -155,7 +155,7 @@ HWY_API Vec128<T, N> Iota(const Simd<T,
   T counter = static_cast<T>(first);
   for (size_t i = 0; i < N; ++i) {
     v.raw[i] = counter;
-    counter = detail::IncrementWithWraparound(counter);
+    counter = detail::IncrementWithWraparound(hwy::IsFloatTag<T>(), counter);
   }
   return v;
 }
@@ -544,8 +544,12 @@ HWY_API Vec128<T, N> operator>>(Vec128<T
 
 // ================================================== ARITHMETIC
 
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
+                            Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
     const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
     const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
@@ -553,31 +557,46 @@ HWY_API Vec128<T, N> operator+(Vec128<T,
   }
   return a;
 }
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
+                            Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    a.raw[i] += b.raw[i];
+    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
+    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
+    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
   }
   return a;
 }
 
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
-    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
-    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
+    a.raw[i] += b.raw[i];
   }
   return a;
 }
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
     a.raw[i] -= b.raw[i];
   }
   return a;
 }
 
+}  // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
+  return detail::Sub(hwy::IsFloatTag<T>(), a, b);
+}
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
+  return detail::Add(hwy::IsFloatTag<T>(), a, b);
+}
+
 // ------------------------------ SumsOf8
 
 template <size_t N>
@@ -612,8 +631,9 @@ HWY_API Vec128<T, N> SaturatedSub(Vec128
 }
 
 // ------------------------------ AverageRound
-template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
+template <typename T, size_t N>
 HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, const Vec128<T, N> b) {
+  static_assert(!IsSigned<T>(), "Only for unsigned");
   for (size_t i = 0; i < N; ++i) {
     a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
   }
@@ -622,8 +642,11 @@ HWY_API Vec128<T, N> AverageRound(Vec128
 
 // ------------------------------ Abs
 
-template <typename T, size_t N, HWY_IF_SIGNED(T)>
-HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Abs(SignedTag /*tag*/, Vec128<T, N> a) {
   for (size_t i = 0; i < N; ++i) {
     const T s = a.raw[i];
     const T min = hwy::LimitsMin<T>();
@@ -631,26 +654,47 @@ HWY_API Vec128<T, N> Abs(Vec128<T, N> a)
   }
   return a;
 }
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
   for (size_t i = 0; i < N; ++i) {
     v.raw[i] = std::abs(v.raw[i]);
   }
   return v;
 }
 
+}  // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
+  return detail::Abs(hwy::TypeTag<T>(), a);
+}
+
 // ------------------------------ Min/Max
 
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
     a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
   }
   return a;
 }
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
+  for (size_t i = 0; i < N; ++i) {
+    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
+  }
+  return a;
+}
 
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
     if (std::isnan(a.raw[i])) {
       a.raw[i] = b.raw[i];
@@ -662,17 +706,9 @@ HWY_API Vec128<T, N> Min(Vec128<T, N> a,
   }
   return a;
 }
-
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
-  }
-  return a;
-}
-
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
     if (std::isnan(a.raw[i])) {
       a.raw[i] = b.raw[i];
@@ -685,44 +721,79 @@ HWY_API Vec128<T, N> Max(Vec128<T, N> a,
   return a;
 }
 
+}  // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
+  return detail::Min(hwy::IsFloatTag<T>(), a, b);
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
+  return detail::Max(hwy::IsFloatTag<T>(), a, b);
+}
+
 // ------------------------------ Neg
 
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) {
+  return Zero(Simd<T, N, 0>()) - v;
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
   return Xor(v, SignBit(Simd<T, N, 0>()));
 }
 
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
+}  // namespace detail
+
+template <typename T, size_t N>
 HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
-  return Zero(Simd<T, N, 0>()) - v;
+  return detail::Neg(hwy::IsFloatTag<T>(), v);
 }
 
 // ------------------------------ Mul/Div
 
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
     a.raw[i] *= b.raw[i];
   }
   return a;
 }
 
-template <typename T, size_t N, HWY_IF_SIGNED(T)>
-HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Mul(SignedTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<T>(int64_t(a.raw[i]) * b.raw[i]);
+    a.raw[i] = static_cast<T>(static_cast<int64_t>(a.raw[i]) * b.raw[i]);
   }
   return a;
 }
 
-template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
-HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Mul(UnsignedTag /*tag*/, Vec128<T, N> a,
+                            const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<T>(uint64_t(a.raw[i]) * b.raw[i]);
+    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) * b.raw[i]);
   }
   return a;
 }
 
+}  // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
+  return detail::Mul(hwy::TypeTag<T>(), a, b);
+}
+
 template <typename T, size_t N>
 HWY_API Vec128<T, N> operator/(Vec128<T, N> a, const Vec128<T, N> b) {
   for (size_t i = 0; i < N; ++i) {
@@ -736,7 +807,7 @@ template <size_t N>
 HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a,
                                    const Vec128<int16_t, N> b) {
   for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<int16_t>((a.raw[i] * b.raw[i]) >> 16);
+    a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
   }
   return a;
 }
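
// A one-value scalar check of the MulHigh change above: the multiply is
// carried out in int32, so the upper 16 bits of the exact product are what
// the shift extracts. RefMulHigh16 is an illustrative name only.
#include <cstdint>
#include <cstdio>

static int16_t RefMulHigh16(int16_t a, int16_t b) {
  return static_cast<int16_t>((int32_t{a} * b) >> 16);
}

int main() {
  printf("%d\n", RefMulHigh16(INT16_MIN, INT16_MIN));  // 2^30 >> 16 = 16384
  return 0;
}
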
@@ -1048,8 +1119,9 @@ HWY_API Mask128<T, N> IsNaN(const Vec128
   return ret;
 }
 
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Simd<T, N, 0> d;
   const RebindToSigned<decltype(d)> di;
   const VFromD<decltype(di)> vi = BitCast(di, v);
@@ -1058,8 +1130,9 @@ HWY_API Mask128<T, N> IsInf(const Vec128
 }
 
 // Returns whether normal/subnormal/zero.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Simd<T, N, 0> d;
   const RebindToUnsigned<decltype(d)> du;
   const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
@@ -1146,6 +1219,15 @@ HWY_API Mask128<uint64_t> Lt128(Simd<uin
   return ret;
 }
 
+HWY_API Mask128<uint64_t> Lt128Upper(Simd<uint64_t, 2, 0> /* tag */,
+                                     Vec128<uint64_t> a,
+                                     const Vec128<uint64_t> b) {
+  const bool lt = a.raw[1] < b.raw[1];
+  Mask128<uint64_t> ret;
+  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
+  return ret;
+}
+
 // ------------------------------ Min128, Max128 (Lt128)
 
 template <class D, class V = VFromD<D>>
@@ -1155,7 +1237,17 @@ HWY_API V Min128(D d, const V a, const V
 
 template <class D, class V = VFromD<D>>
 HWY_API V Max128(D d, const V a, const V b) {
-  return IfThenElse(Lt128(d, a, b), b, a);
+  return IfThenElse(Lt128(d, b, a), a, b);
+}
+
+template <class D, class V = VFromD<D>>
+HWY_API V Min128Upper(D d, const V a, const V b) {
+  return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D, class V = VFromD<D>>
+HWY_API V Max128Upper(D d, const V a, const V b) {
+  return IfThenElse(Lt128Upper(d, b, a), a, b);
 }
 
 // ================================================== MEMORY
@@ -1433,31 +1525,24 @@ HWY_API Vec128<ToT, N> DemoteTo(Simd<ToT
 template <size_t N>
 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
     Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
-  const RebindToUnsigned<decltype(dbf16)> du16;
   const Repartition<uint32_t, decltype(dbf16)> du32;
-  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
+  const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(BitCast(du32, b));
+  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
+  const Vec128<uint32_t, N> a_mask = Set(du32, 0xFFFF0000);
+  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
 }
 
 namespace detail {
 
 HWY_INLINE void StoreU16ToF16(const uint16_t val,
                               hwy::float16_t* HWY_RESTRICT to) {
-#if HWY_NATIVE_FLOAT16
   CopyBytes<2>(&val, to);
-#else
-  to->bits = val;
-#endif
 }
 
 HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
-#if HWY_NATIVE_FLOAT16
   uint16_t bits16;
   CopyBytes<2>(from, &bits16);
   return bits16;
-#else
-  return from->bits;
-#endif
 }
 
 }  // namespace detail
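
// A one-value scalar sketch of the bfloat16 packing used by the
// ReorderDemote2To change above: each bfloat16 is the upper 16 bits of the
// float (truncation, no rounding), selected via a bit mask so the result
// does not depend on lane order. RefBF16FromF32 is an illustrative name,
// not Highway API.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t RefBF16FromF32(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

int main() {
  printf("0x%04X\n", RefBF16FromF32(1.0f));  // 1.0f -> 0x3F80
  return 0;
}
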
@@ -1554,8 +1639,12 @@ HWY_API Vec128<bfloat16_t, N> DemoteTo(S
   return ret;
 }
 
-template <typename FromT, typename ToT, size_t N, HWY_IF_FLOAT(FromT)>
-HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename FromT, typename ToT, size_t N>
+HWY_API Vec128<ToT, N> ConvertTo(hwy::FloatTag /*tag*/,
+                                 Simd<ToT, N, 0> /* tag */,
                                  Vec128<FromT, N> from) {
   static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
   Vec128<ToT, N> ret;
@@ -1574,8 +1663,9 @@ HWY_API Vec128<ToT, N> ConvertTo(Simd<To
   return ret;
 }
 
-template <typename FromT, typename ToT, size_t N, HWY_IF_NOT_FLOAT(FromT)>
-HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
+template <typename FromT, typename ToT, size_t N>
+HWY_API Vec128<ToT, N> ConvertTo(hwy::NonFloatTag /*tag*/,
+                                 Simd<ToT, N, 0> /* tag */,
                                  Vec128<FromT, N> from) {
   static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
   Vec128<ToT, N> ret;
@@ -1586,11 +1676,80 @@ HWY_API Vec128<ToT, N> ConvertTo(Simd<To
   return ret;
 }
 
+}  // namespace detail
+
+template <typename FromT, typename ToT, size_t N>
+HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> d, Vec128<FromT, N> from) {
+  return detail::ConvertTo(hwy::IsFloatTag<FromT>(), d, from);
+}
+
 template <size_t N>
 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
   return DemoteTo(Simd<uint8_t, N, 0>(), v);
 }
 
+// ------------------------------ Truncations
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint64_t, N> v) {
+  Vec128<uint8_t, N> ret;
+  for (size_t i = 0; i < N; ++i) {
+    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
+  }
+  return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+                                       const Vec128<uint64_t, N> v) {
+  Vec128<uint16_t, N> ret;
+  for (size_t i = 0; i < N; ++i) {
+    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
+  }
+  return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint32_t, N> TruncateTo(Simd<uint32_t, N, 0> /* tag */,
+                                       const Vec128<uint64_t, N> v) {
+  Vec128<uint32_t, N> ret;
+  for (size_t i = 0; i < N; ++i) {
+    ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
+  }
+  return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint32_t, N> v) {
+  Vec128<uint8_t, N> ret;
+  for (size_t i = 0; i < N; ++i) {
+    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
+  }
+  return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+                                       const Vec128<uint32_t, N> v) {
+  Vec128<uint16_t, N> ret;
+  for (size_t i = 0; i < N; ++i) {
+    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
+  }
+  return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint16_t, N> v) {
+  Vec128<uint8_t, N> ret;
+  for (size_t i = 0; i < N; ++i) {
+    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
+  }
+  return ret;
+}
+
 // ================================================== COMBINE
 
 template <typename T, size_t N>
@@ -1908,15 +2067,17 @@ HWY_API Vec128<T, N> Reverse8(Simd<T, N,
 // ------------------------------ Shuffle*
 
 // Swap 32-bit halves in 64-bit halves.
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+template <typename T, size_t N>
 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
+  static_assert(sizeof(T) == 4, "Only for 32-bit");
   static_assert(N == 2 || N == 4, "Does not make sense for N=1");
   return Reverse2(DFromV<decltype(v)>(), v);
 }
 
 // Swap 64-bit halves
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+template <typename T>
 HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
+  static_assert(sizeof(T) == 4, "Only for 32-bit");
   Vec128<T> ret;
   ret.raw[3] = v.raw[1];
   ret.raw[2] = v.raw[0];
@@ -1924,8 +2085,9 @@ HWY_API Vec128<T> Shuffle1032(const Vec1
   ret.raw[0] = v.raw[2];
   return ret;
 }
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+template <typename T>
 HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
+  static_assert(sizeof(T) == 8, "Only for 64-bit");
   return Reverse2(DFromV<decltype(v)>(), v);
 }
 
@@ -2136,6 +2298,31 @@ HWY_API Vec128<T, N> Compress(Vec128<T,
   return ret;
 }
 
+// ------------------------------ CompressNot
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, const Mask128<T, N> mask) {
+  size_t count = 0;
+  Vec128<T, N> ret;
+  for (size_t i = 0; i < N; ++i) {
+    if (!mask.bits[i]) {
+      ret.raw[count++] = v.raw[i];
+    }
+  }
+  for (size_t i = 0; i < N; ++i) {
+    if (mask.bits[i]) {
+      ret.raw[count++] = v.raw[i];
+    }
+  }
+  HWY_DASSERT(count == N);
+  return ret;
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+                                           Mask128<uint64_t> /* m */) {
+  return v;
+}
+
 // ------------------------------ CompressBits
 template <typename T, size_t N>
 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
@@ -2182,13 +2369,12 @@ HWY_API Vec128<float, N> ReorderWidenMul
                                                    Vec128<bfloat16_t, 2 * N> b,
                                                    const Vec128<float, N> sum0,
                                                    Vec128<float, N>& sum1) {
-  const Repartition<uint16_t, decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
-  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
+  const Rebind<bfloat16_t, decltype(df32)> dbf16;
+  // Avoid ZipLower/Upper so this also works on big-endian systems.
+  const Vec128<float, N> a0 = PromoteTo(df32, LowerHalf(dbf16, a));
+  const Vec128<float, N> a1 = PromoteTo(df32, UpperHalf(dbf16, a));
+  const Vec128<float, N> b0 = PromoteTo(df32, LowerHalf(dbf16, b));
+  const Vec128<float, N> b1 = PromoteTo(df32, UpperHalf(dbf16, b));
   sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
   return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
 }
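
// The emu128 changes above repeatedly replace SFINAE overloads
// (HWY_IF_FLOAT / HWY_IF_NOT_FLOAT) with tag dispatch so that MSVC 2017 can
// compile them. A minimal standalone sketch of the same pattern, using
// std::is_floating_point in place of hwy::IsFloatTag; all names below are
// illustrative only.
#include <cstdio>
#include <type_traits>

struct FloatTag {};
struct NonFloatTag {};
template <typename T>
using TagFor = typename std::conditional<std::is_floating_point<T>::value,
                                         FloatTag, NonFloatTag>::type;

namespace detail {
template <typename T>
T Halve(FloatTag /*tag*/, T x) {
  return x * T(0.5);  // exact halving for floating point
}
template <typename T>
T Halve(NonFloatTag /*tag*/, T x) {
  return static_cast<T>(x / 2);  // truncating halving for integers
}
}  // namespace detail

// Single public entry point: one unconstrained template, no SFINAE.
template <typename T>
T Halve(T x) {
  return detail::Halve(TagFor<T>(), x);
}

int main() {
  printf("%g %d\n", Halve(3.0), Halve(3));  // 1.5 1
  return 0;
}
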
diff -pruN 0.17.0-11/hwy/ops/generic_ops-inl.h 1.0.0-2/hwy/ops/generic_ops-inl.h
--- 0.17.0-11/hwy/ops/generic_ops-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/generic_ops-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -1192,10 +1192,11 @@ HWY_API V CLMulUpper(V a, V b) {
 
 // This algorithm requires vectors to be at least 16 bytes, which is the case
 // for LMUL >= 2. If not, use the fallback below.
-template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>),
-          HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
+          HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
 HWY_API V PopulationCount(V v) {
-  const DFromV<V> d;
+  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
+  const D d;
   HWY_ALIGN constexpr uint8_t kLookup[16] = {
       0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
   };
@@ -1208,9 +1209,10 @@ HWY_API V PopulationCount(V v) {
 // RVV has a specialization that avoids the Set().
 #if HWY_TARGET != HWY_RVV
 // Slower fallback for capped vectors.
-template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1), HWY_IF_LT128_D(D)>
 HWY_API V PopulationCount(V v) {
-  const DFromV<V> d;
+  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
+  const D d;
   // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
   v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
   v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
@@ -1218,26 +1220,29 @@ HWY_API V PopulationCount(V v) {
 }
 #endif  // HWY_TARGET != HWY_RVV
 
-template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
 HWY_API V PopulationCount(V v) {
-  const DFromV<V> d;
+  static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
+  const D d;
   const Repartition<uint8_t, decltype(d)> d8;
   const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
   return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
 }
 
-template <typename V, HWY_IF_LANES_ARE(uint32_t, V)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
 HWY_API V PopulationCount(V v) {
-  const DFromV<V> d;
+  static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
+  const D d;
   Repartition<uint16_t, decltype(d)> d16;
   auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
   return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
 }
 
 #if HWY_HAVE_INTEGER64
-template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
 HWY_API V PopulationCount(V v) {
-  const DFromV<V> d;
+  static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
+  const D d;
   Repartition<uint32_t, decltype(d)> d32;
   auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
   return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
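
// A scalar sketch of the bit-counting recurrence that the PopulationCount
// overloads above vectorize: count bits per byte (the arXiv 1611.07612,
// Figure 3 steps), then widen by adding per-byte counts pairwise via shift
// and mask. Helper names are illustrative only.
#include <cstdint>
#include <cstdio>

static uint8_t PopCount8(uint8_t v) {
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));
  v = static_cast<uint8_t>(((v >> 2) & 0x33) + (v & 0x33));
  return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);
}

// Mirrors the u16 overload: vals = per-byte counts, result = hi + lo.
static uint16_t PopCount16(uint16_t v) {
  const uint16_t lo = PopCount8(static_cast<uint8_t>(v & 0xFF));
  const uint16_t hi = PopCount8(static_cast<uint8_t>(v >> 8));
  return static_cast<uint16_t>(hi + lo);
}

int main() {
  printf("%u %u\n", static_cast<unsigned>(PopCount8(0xF0)),
         static_cast<unsigned>(PopCount16(0xFFFF)));  // 4 16
  return 0;
}
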
diff -pruN 0.17.0-11/hwy/ops/rvv-inl.h 1.0.0-2/hwy/ops/rvv-inl.h
--- 0.17.0-11/hwy/ops/rvv-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/rvv-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -1087,9 +1087,9 @@ HWY_API auto TestBit(const V a, const V
 }
 
 // ------------------------------ Not
+// NOLINTNEXTLINE
 HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not )
 
-
 // ------------------------------ And
 
 // mask = f(mask_a, mask_b) (note arg2,arg1 order!)
@@ -1690,6 +1690,249 @@ HWY_API vuint8m2_t U8FromU32(const vuint
   return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl);
 }
 
+// ------------------------------ Truncations
+
+template <size_t N>
+HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
+                               const VFromD<Simd<uint64_t, N, 0>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m1_t v1 = vand(v, 0xFF, avl);
+  const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl);
+  const vuint16mf4_t v3 = vnclipu_wx_u16mf4(v2, 0, avl);
+  return vnclipu_wx_u8mf8(v3, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
+                               const VFromD<Simd<uint64_t, N, 1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m2_t v1 = vand(v, 0xFF, avl);
+  const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl);
+  const vuint16mf2_t v3 = vnclipu_wx_u16mf2(v2, 0, avl);
+  return vnclipu_wx_u8mf4(v3, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
+                               const VFromD<Simd<uint64_t, N, 2>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m4_t v1 = vand(v, 0xFF, avl);
+  const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl);
+  const vuint16m1_t v3 = vnclipu_wx_u16m1(v2, 0, avl);
+  return vnclipu_wx_u8mf2(v3, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
+                              const VFromD<Simd<uint64_t, N, 3>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m8_t v1 = vand(v, 0xFF, avl);
+  const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl);
+  const vuint16m2_t v3 = vnclipu_wx_u16m2(v2, 0, avl);
+  return vnclipu_wx_u8m1(v3, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
+                                const VFromD<Simd<uint64_t, N, 0>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m1_t v1 = vand(v, 0xFFFF, avl);
+  const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl);
+  return vnclipu_wx_u16mf4(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
+                                const VFromD<Simd<uint64_t, N, 1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m2_t v1 = vand(v, 0xFFFF, avl);
+  const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl);
+  return vnclipu_wx_u16mf2(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
+                               const VFromD<Simd<uint64_t, N, 2>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m4_t v1 = vand(v, 0xFFFF, avl);
+  const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl);
+  return vnclipu_wx_u16m1(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
+                               const VFromD<Simd<uint64_t, N, 3>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m8_t v1 = vand(v, 0xFFFF, avl);
+  const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl);
+  return vnclipu_wx_u16m2(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -1> d,
+                                const VFromD<Simd<uint64_t, N, 0>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m1_t v1 = vand(v, 0xFFFFFFFFu, avl);
+  return vnclipu_wx_u32mf2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint32m1_t TruncateTo(Simd<uint32_t, N, 0> d,
+                               const VFromD<Simd<uint64_t, N, 1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m2_t v1 = vand(v, 0xFFFFFFFFu, avl);
+  return vnclipu_wx_u32m1(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint32m2_t TruncateTo(Simd<uint32_t, N, 1> d,
+                               const VFromD<Simd<uint64_t, N, 2>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m4_t v1 = vand(v, 0xFFFFFFFFu, avl);
+  return vnclipu_wx_u32m2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint32m4_t TruncateTo(Simd<uint32_t, N, 2> d,
+                               const VFromD<Simd<uint64_t, N, 3>> v) {
+  const size_t avl = Lanes(d);
+  const vuint64m8_t v1 = vand(v, 0xFFFFFFFFu, avl);
+  return vnclipu_wx_u32m4(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
+                               const VFromD<Simd<uint32_t, N, -1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32mf2_t v1 = vand(v, 0xFF, avl);
+  const vuint16mf4_t v2 = vnclipu_wx_u16mf4(v1, 0, avl);
+  return vnclipu_wx_u8mf8(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
+                               const VFromD<Simd<uint32_t, N, 0>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32m1_t v1 = vand(v, 0xFF, avl);
+  const vuint16mf2_t v2 = vnclipu_wx_u16mf2(v1, 0, avl);
+  return vnclipu_wx_u8mf4(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
+                               const VFromD<Simd<uint32_t, N, 1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32m2_t v1 = vand(v, 0xFF, avl);
+  const vuint16m1_t v2 = vnclipu_wx_u16m1(v1, 0, avl);
+  return vnclipu_wx_u8mf2(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
+                              const VFromD<Simd<uint32_t, N, 2>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32m4_t v1 = vand(v, 0xFF, avl);
+  const vuint16m2_t v2 = vnclipu_wx_u16m2(v1, 0, avl);
+  return vnclipu_wx_u8m1(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
+                              const VFromD<Simd<uint32_t, N, 3>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32m8_t v1 = vand(v, 0xFF, avl);
+  const vuint16m4_t v2 = vnclipu_wx_u16m4(v1, 0, avl);
+  return vnclipu_wx_u8m2(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
+                                const VFromD<Simd<uint32_t, N, -1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32mf2_t v1 = vand(v, 0xFFFF, avl);
+  return vnclipu_wx_u16mf4(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
+                                const VFromD<Simd<uint32_t, N, 0>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32m1_t v1 = vand(v, 0xFFFF, avl);
+  return vnclipu_wx_u16mf2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
+                               const VFromD<Simd<uint32_t, N, 1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32m2_t v1 = vand(v, 0xFFFF, avl);
+  return vnclipu_wx_u16m1(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
+                               const VFromD<Simd<uint32_t, N, 2>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32m4_t v1 = vand(v, 0xFFFF, avl);
+  return vnclipu_wx_u16m2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m4_t TruncateTo(Simd<uint16_t, N, 2> d,
+                               const VFromD<Simd<uint32_t, N, 3>> v) {
+  const size_t avl = Lanes(d);
+  const vuint32m8_t v1 = vand(v, 0xFFFF, avl);
+  return vnclipu_wx_u16m4(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
+                               const VFromD<Simd<uint16_t, N, -2>> v) {
+  const size_t avl = Lanes(d);
+  const vuint16mf4_t v1 = vand(v, 0xFF, avl);
+  return vnclipu_wx_u8mf8(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
+                               const VFromD<Simd<uint16_t, N, -1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint16mf2_t v1 = vand(v, 0xFF, avl);
+  return vnclipu_wx_u8mf4(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
+                               const VFromD<Simd<uint16_t, N, 0>> v) {
+  const size_t avl = Lanes(d);
+  const vuint16m1_t v1 = vand(v, 0xFF, avl);
+  return vnclipu_wx_u8mf2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
+                              const VFromD<Simd<uint16_t, N, 1>> v) {
+  const size_t avl = Lanes(d);
+  const vuint16m2_t v1 = vand(v, 0xFF, avl);
+  return vnclipu_wx_u8m1(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
+                              const VFromD<Simd<uint16_t, N, 2>> v) {
+  const size_t avl = Lanes(d);
+  const vuint16m4_t v1 = vand(v, 0xFF, avl);
+  return vnclipu_wx_u8m2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m4_t TruncateTo(Simd<uint8_t, N, 2> d,
+                              const VFromD<Simd<uint16_t, N, 3>> v) {
+  const size_t avl = Lanes(d);
+  const vuint16m8_t v1 = vand(v, 0xFF, avl);
+  return vnclipu_wx_u8m4(v1, 0, avl);
+}
+
 // ------------------------------ DemoteTo I
 
 HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
@@ -1823,7 +2066,8 @@ template <size_t kLanes, class D>
 HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
   const RebindToUnsigned<D> du;
   const RebindToSigned<D> di;
-  const auto idx_mod = AndS(Iota0(du), LanesPerBlock(du) - 1);
+  using TU = TFromD<decltype(du)>;
+  const auto idx_mod = AndS(Iota0(du), static_cast<TU>(LanesPerBlock(du) - 1));
   return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
 }
 
@@ -2190,6 +2434,18 @@ HWY_RVV_FOREACH_UI163264(HWY_RVV_COMPRES
 HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress, _ALL)
 #undef HWY_RVV_COMPRESS
 
+// ------------------------------ CompressNot
+template <class V, class M>
+HWY_API V CompressNot(V v, const M mask) {
+  return Compress(v, Not(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+template <class V, class M>
+HWY_API V CompressBlocksNot(V v, const M mask) {
+  return CompressNot(v, mask);
+}
+
 // ------------------------------ CompressStore
 template <class V, class M, class D>
 HWY_API size_t CompressStore(const V v, const M mask, const D d,
@@ -2346,7 +2602,7 @@ HWY_API VI TableLookupBytes(const VT vt,
   // If the table is shorter, wrap around offsets so they do not reference
   // undefined lanes in the newly extended vmt.
   if (kPow2T < kPow2I) {
-    offsets = detail::AndS(offsets, Lanes(dt8) - 1);
+    offsets = detail::AndS(offsets, static_cast<uint8_t>(Lanes(dt8) - 1));
   }
   const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
   return BitCast(di, detail::ChangeLMUL(di8, out));
@@ -2381,8 +2637,8 @@ HWY_API V ShiftLeftLanes(const D d, cons
   using TI = TFromD<decltype(di)>;
   const auto shifted = detail::SlideUp(v, v, kLanes);
   // Match x86 semantics by zeroing lower lanes in 128-bit blocks
-  const auto idx_mod =
-      detail::AndS(detail::Iota0(di), detail::LanesPerBlock(di) - 1);
+  const auto idx_mod = detail::AndS(
+      detail::Iota0(di), static_cast<TI>(detail::LanesPerBlock(di) - 1));
   const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
   return IfThenZeroElse(clear, shifted);
 }
@@ -2419,7 +2675,8 @@ HWY_API V ShiftRightLanes(const Simd<T,
   const auto shifted = detail::SlideDown(v, v, kLanes);
   // Match x86 semantics by zeroing upper lanes in 128-bit blocks
   const size_t lpb = detail::LanesPerBlock(di);
-  const auto idx_mod = detail::AndS(detail::Iota0(di), lpb - 1);
+  const auto idx_mod =
+      detail::AndS(detail::Iota0(di), static_cast<TI>(lpb - 1));
   const auto keep =
       detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
   return IfThenElseZero(keep, shifted);
@@ -2438,9 +2695,10 @@ template <class D, class V>
 HWY_API V InterleaveLower(D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
   const RebindToUnsigned<decltype(d)> du;
+  using TU = TFromD<decltype(du)>;
   const auto i = detail::Iota0(du);
-  const auto idx_mod =
-      ShiftRight<1>(detail::AndS(i, detail::LanesPerBlock(du) - 1));
+  const auto idx_mod = ShiftRight<1>(
+      detail::AndS(i, static_cast<TU>(detail::LanesPerBlock(du) - 1)));
   const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
   const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
   return IfThenElse(is_even, TableLookupLanes(a, idx),
@@ -2458,11 +2716,12 @@ template <class D, class V>
 HWY_API V InterleaveUpper(const D d, const V a, const V b) {
   static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
   const RebindToUnsigned<decltype(d)> du;
+  using TU = TFromD<decltype(du)>;
   const size_t lpb = detail::LanesPerBlock(du);
   const auto i = detail::Iota0(du);
-  const auto idx_mod = ShiftRight<1>(detail::AndS(i, lpb - 1));
+  const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast<TU>(lpb - 1)));
   const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
-  const auto idx = detail::AddS(idx_lower, lpb / 2);
+  const auto idx = detail::AddS(idx_lower, static_cast<TU>(lpb / 2));
   const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
   return IfThenElse(is_even, TableLookupLanes(a, idx),
                     TableLookupLanes(b, idx));
@@ -2552,7 +2811,7 @@ HWY_API VFromD<D> MaxOfLanes(D d, const
 // ------------------------------ PopulationCount (ShiftRight)
 
 // Handles LMUL >= 2 or capped vectors, which generic_ops-inl cannot.
-template <typename V, class D = DFromV<V>, HWY_IF_LANES_ARE(uint8_t, V),
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
           hwy::EnableIf<Pow2(D()) < 1 || MaxLanes(D()) < 16>* = nullptr>
 HWY_API V PopulationCount(V v) {
   // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
@@ -2563,11 +2822,12 @@ HWY_API V PopulationCount(V v) {
 
 // ------------------------------ LoadDup128
 
-template <class D>
-HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
+template <class D, typename T = TFromD<D>>
+HWY_API VFromD<D> LoadDup128(D d, const T* const HWY_RESTRICT p) {
   const auto loaded = Load(d, p);
   // Broadcast the first block
-  const auto idx = detail::AndS(detail::Iota0(d), detail::LanesPerBlock(d) - 1);
+  const auto idx = detail::AndS(detail::Iota0(d),
+                                static_cast<T>(detail::LanesPerBlock(d) - 1));
   return TableLookupLanes(loaded, idx);
 }
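
// A standalone sketch of the index arithmetic the RVV hunks above keep
// re-casting: i & (lanes_per_block - 1) is a lane's position within its
// 128-bit block, and the static_cast is needed because AndS expects the
// vector's lane type rather than size_t. Names below are illustrative only.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const size_t lanes_per_block = 16 / sizeof(uint32_t);  // 4 u32 lanes/block
  for (uint32_t i = 0; i < 8; ++i) {
    const uint32_t idx_mod = i & static_cast<uint32_t>(lanes_per_block - 1);
    printf("%u ", idx_mod);  // prints 0 1 2 3 0 1 2 3
  }
  printf("\n");
  return 0;
}
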
 
@@ -2859,7 +3119,6 @@ HWY_API auto ReorderWidenMulAccumulate(S
 }
 
 // ------------------------------ Lt128
-
 template <class D>
 HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
   static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
@@ -2885,6 +3144,15 @@ HWY_INLINE MFromD<D> Lt128(D d, const VF
   return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
 }
 
+// ------------------------------ Lt128Upper
+template <class D>
+HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
+  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
+  // Replicate H to its neighbor.
+  return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
+}
+
 // ------------------------------ Min128, Max128 (Lt128)
 
 template <class D>
@@ -2915,6 +3183,16 @@ HWY_INLINE VFromD<D> Max128(D /* tag */,
   return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
 }
 
+template <class D>
+HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
+  return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
+  return IfThenElse(Lt128Upper(d, b, a), a, b);
+}
+
 // ================================================== END MACROS
 namespace detail {  // for code folding
 #undef HWY_RVV_AVL
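
// A one-lane reference sketch of the TruncateTo semantics added in this
// file (and in emu128/scalar above): keep only the low bits of each lane,
// with no saturation (unlike DemoteTo). RefTruncate is an illustrative
// name, not Highway API.
#include <cstdint>
#include <cstdio>

template <typename To, typename From>
static To RefTruncate(From v) {
  return static_cast<To>(v);  // equivalent to (v & 0xFF) for uint8_t, etc.
}

int main() {
  const uint32_t v = 0x1234u;
  printf("%u\n", static_cast<unsigned>(RefTruncate<uint8_t>(v)));  // 0x34 = 52
  return 0;
}
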
diff -pruN 0.17.0-11/hwy/ops/scalar-inl.h 1.0.0-2/hwy/ops/scalar-inl.h
--- 0.17.0-11/hwy/ops/scalar-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/scalar-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -128,6 +128,9 @@ HWY_API Vec1<T> Iota(const Sisd<T> /* ta
   return Vec1<T>(static_cast<T>(first));
 }
 
+template <class D>
+using VFromD = decltype(Zero(D()));
+
 // ================================================== LOGICAL
 
 // ------------------------------ Not
@@ -722,7 +725,7 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
   const TI rounded = static_cast<TI>(v.raw + bias);
   if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
   // Round to even
-  if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
+  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
     return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
   }
   return Vec1<T>(static_cast<T>(rounded));
@@ -1115,12 +1118,8 @@ HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /*
 }
 
 HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
-#if HWY_NATIVE_FLOAT16
   uint16_t bits16;
   CopyBytes<2>(&v.raw, &bits16);
-#else
-  const uint16_t bits16 = v.raw.bits;
-#endif
   const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
   const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
   const uint32_t mantissa = bits16 & 0x3FF;
@@ -1158,12 +1157,8 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<fl
   // Tiny or zero => zero.
   Vec1<float16_t> out;
   if (exp < -24) {
-#if HWY_NATIVE_FLOAT16
     const uint16_t zero = 0;
     CopyBytes<2>(&zero, &out.raw);
-#else
-    out.raw.bits = 0;
-#endif
     return out;
   }
 
@@ -1186,12 +1181,8 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<fl
   HWY_DASSERT(mantissa16 < 1024);
   const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
   HWY_DASSERT(bits16 < 0x10000);
-#if HWY_NATIVE_FLOAT16
   const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
   CopyBytes<2>(&narrowed, &out.raw);
-#else
-  out.raw.bits = static_cast<uint16_t>(bits16);
-#endif
   return out;
 }
 
@@ -1224,6 +1215,38 @@ HWY_API Vec1<uint8_t> U8FromU32(const Ve
   return DemoteTo(Sisd<uint8_t>(), v);
 }
 
+// ------------------------------ Truncations
+
+HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
+                                 const Vec1<uint64_t> v) {
+  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
+}
+
+HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
+                                  const Vec1<uint64_t> v) {
+  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
+}
+
+HWY_API Vec1<uint32_t> TruncateTo(Sisd<uint32_t> /* tag */,
+                                  const Vec1<uint64_t> v) {
+  return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
+}
+
+HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
+                                 const Vec1<uint32_t> v) {
+  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
+}
+
+HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
+                                  const Vec1<uint32_t> v) {
+  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
+}
+
+HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
+                                 const Vec1<uint16_t> v) {
+  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
+}
+
 // ================================================== COMBINE
 // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
 
@@ -1454,7 +1477,13 @@ struct CompressIsPartition {
 
 template <typename T>
 HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
-  // Upper lanes are undefined, so result is the same independent of mask.
+  // A single lane is already partitioned by definition.
+  return v;
+}
+
+template <typename T>
+HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
+  // A single lane is already partitioned by definition.
   return v;
 }
 
diff -pruN 0.17.0-11/hwy/ops/set_macros-inl.h 1.0.0-2/hwy/ops/set_macros-inl.h
--- 0.17.0-11/hwy/ops/set_macros-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/set_macros-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -227,17 +227,25 @@
 
 #define HWY_NAMESPACE N_NEON
 
-// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+// Can use pragmas instead of -march compiler flag
+#if HWY_HAVE_RUNTIME_DISPATCH
+#if HWY_ARCH_ARM_V7
+#define HWY_TARGET_STR "+neon-vfpv4"
+#else
+#define HWY_TARGET_STR "+crypto"
+#endif  // HWY_ARCH_ARM_V7
+#else
+// HWY_TARGET_STR remains undefined
+#endif
 
 //-----------------------------------------------------------------------------
 // SVE[2]
-#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE
+#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
+    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
 
 // SVE only requires lane alignment, not natural alignment of the entire vector.
 #define HWY_ALIGN alignas(8)
 
-#define HWY_MAX_BYTES 256
-
 // Value ensures MaxLanes() is the tightest possible upper bound to reduce
 // overallocation.
 #define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
@@ -253,11 +261,28 @@
 
 #if HWY_TARGET == HWY_SVE2
 #define HWY_NAMESPACE N_SVE2
+#define HWY_MAX_BYTES 256
+#elif HWY_TARGET == HWY_SVE_256
+#define HWY_NAMESPACE N_SVE_256
+#define HWY_MAX_BYTES 32
+#elif HWY_TARGET == HWY_SVE2_128
+#define HWY_NAMESPACE N_SVE2_128
+#define HWY_MAX_BYTES 16
 #else
 #define HWY_NAMESPACE N_SVE
+#define HWY_MAX_BYTES 256
 #endif
 
+// Can use pragmas instead of -march compiler flag
+#if HWY_HAVE_RUNTIME_DISPATCH
+#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
+#define HWY_TARGET_STR "+sve2-aes"
+#else
+#define HWY_TARGET_STR "+sve"
+#endif
+#else
 // HWY_TARGET_STR remains undefined
+#endif
 
 //-----------------------------------------------------------------------------
 // WASM
@@ -281,8 +306,8 @@
 #define HWY_TARGET_STR "simd128"
 
 //-----------------------------------------------------------------------------
-// WASM2
-#elif HWY_TARGET == HWY_WASM2
+// WASM_EMU256
+#elif HWY_TARGET == HWY_WASM_EMU256
 
 #define HWY_ALIGN alignas(32)
 #define HWY_MAX_BYTES 32
@@ -297,7 +322,7 @@
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0
 
-#define HWY_NAMESPACE N_WASM2
+#define HWY_NAMESPACE N_WASM_EMU256
 
 #define HWY_TARGET_STR "simd128"
 
@@ -324,7 +349,7 @@
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0
 
-#if defined(__riscv_zfh)
+#if defined(__riscv_zvfh)
 #define HWY_HAVE_FLOAT16 1
 #else
 #define HWY_HAVE_FLOAT16 0
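
// A compile-time sketch of how the per-target HWY_MAX_BYTES values above
// interact with HWY_LANES(T) = HWY_MAX_BYTES / sizeof(T): the new SVE_256
// target (32 bytes) caps a u64 vector at 4 lanes, SVE2_128 (16 bytes) at 2.
// MaxLanesFor is an illustrative helper, not Highway API.
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t MaxLanesFor(size_t max_bytes, size_t lane_size) {
  return max_bytes / lane_size;
}

static_assert(MaxLanesFor(32, sizeof(uint64_t)) == 4, "SVE_256, u64");
static_assert(MaxLanesFor(16, sizeof(uint64_t)) == 2, "SVE2_128, u64");

int main() {
  printf("%zu\n", MaxLanesFor(256, sizeof(uint8_t)));  // SVE/SVE2: 256 lanes
  return 0;
}
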
diff -pruN 0.17.0-11/hwy/ops/shared-inl.h 1.0.0-2/hwy/ops/shared-inl.h
--- 0.17.0-11/hwy/ops/shared-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/shared-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -99,21 +99,21 @@ struct Simd {
 
 namespace detail {
 
-#if HWY_HAVE_SCALABLE
-
 template <typename T, size_t N, int kPow2>
 constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
   return N == HWY_LANES(T) && kPow2 == 0;
 }
 
-#endif
-
 // Returns the number of lanes (possibly zero) after applying a shift:
 // - 0: no change;
 // - [1,3]: a group of 2,4,8 [fractional] vectors;
 // - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
 constexpr size_t ScaleByPower(size_t N, int pow2) {
+#if HWY_TARGET == HWY_RVV
   return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
+#else
+  return pow2 >= 0 ? N : (N >> (-pow2));
+#endif
 }
 
 // Struct wrappers enable validation of arguments via static_assert.
@@ -241,17 +241,13 @@ using Full128 = Simd<T, 16 / sizeof(T),
 #define HWY_IF_GE128_D(D) \
   hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
 
-// Same, but with a vector argument.
+// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
 #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
 #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
 #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
 #define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
 #define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
 
-// For implementing functions for a specific type.
-// IsSame<...>() in template arguments is broken on MSVC2015.
-#define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr
-
 template <class D>
 HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
   return D::kPrivatePow2;
@@ -301,8 +297,7 @@ HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes
 // We therefore pass by const& only on GCC and (Windows or ARM64). This alias
 // must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
 // and possibly also other functions that are not inlined.
-#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
-    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
+#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
 template <class V>
 using VecArg = const V&;
 #else
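
// A standalone sketch of the ScaleByPower change above: only RVV (which has
// register grouping) scales N up for positive pow2; other targets leave N
// unchanged, and negative pow2 still selects a fraction of a vector. The
// target is modelled here as a plain bool instead of HWY_TARGET.
#include <cstddef>
#include <cstdio>

constexpr size_t ScaleByPowerRef(bool is_rvv, size_t n, int pow2) {
  return pow2 >= 0 ? (is_rvv ? (n << pow2) : n) : (n >> (-pow2));
}

int main() {
  printf("%zu %zu %zu\n",
         ScaleByPowerRef(true, 4, 2),     // RVV, LMUL=4: 16
         ScaleByPowerRef(false, 4, 2),    // non-RVV: 4
         ScaleByPowerRef(false, 4, -1));  // half vector: 2
  return 0;
}
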
diff -pruN 0.17.0-11/hwy/ops/wasm_128-inl.h 1.0.0-2/hwy/ops/wasm_128-inl.h
--- 0.17.0-11/hwy/ops/wasm_128-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/wasm_128-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -60,7 +60,7 @@ struct Raw128<float> {
   using type = __f32x4;
 };
 
-} // namespace detail
+}  // namespace detail
 
 template <typename T, size_t N = 16 / sizeof(T)>
 class Vec128 {
@@ -3296,6 +3296,70 @@ HWY_API Vec128<uint8_t, N> U8FromU32(con
       wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
 }
 
+// ------------------------------ Truncations
+
+template <typename From, typename To, HWY_IF_UNSIGNED(From),
+          HWY_IF_UNSIGNED(To),
+          hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
+HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
+                                 const Vec128<From, 1> v) {
+  const Repartition<To, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  return Vec128<To, 1>{v1.raw};
+}
+
+HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
+                                      const Vec128<uint64_t> v) {
+  const Full128<uint8_t> d;
+  const auto v1 = BitCast(d, v);
+  const auto v2 = ConcatEven(d, v1, v1);
+  const auto v4 = ConcatEven(d, v2, v2);
+  return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
+}
+
+HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
+                                       const Vec128<uint64_t> v) {
+  const Full128<uint16_t> d;
+  const auto v1 = BitCast(d, v);
+  const auto v2 = ConcatEven(d, v1, v1);
+  return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
+}
+
+HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
+                                       const Vec128<uint64_t> v) {
+  const Full128<uint32_t> d;
+  const auto v1 = BitCast(d, v);
+  return LowerHalf(ConcatEven(d, v1, v1));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint32_t, N> v) {
+  const Full128<uint8_t> d;
+  const auto v1 = Vec128<uint8_t>{v.raw};
+  const auto v2 = ConcatEven(d, v1, v1);
+  const auto v3 = ConcatEven(d, v2, v2);
+  return Vec128<uint8_t, N>{v3.raw};
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+                                       const Vec128<uint32_t, N> v) {
+  const Full128<uint16_t> d;
+  const auto v1 = Vec128<uint16_t>{v.raw};
+  const auto v2 = ConcatEven(d, v1, v1);
+  return Vec128<uint16_t, N>{v2.raw};
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint16_t, N> v) {
+  const Full128<uint8_t> d;
+  const auto v1 = Vec128<uint8_t>{v.raw};
+  const auto v2 = ConcatEven(d, v1, v1);
+  return Vec128<uint8_t, N>{v2.raw};
+}
+
 // ------------------------------ Convert i32 <=> f32 (Round)
 
 template <size_t N>
@@ -3625,8 +3689,8 @@ HWY_API intptr_t FindFirstTrue(const Sim
 
 namespace detail {
 
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 256);
   const Simd<T, N, 0> d;
   const Rebind<uint8_t, decltype(d)> d8;
@@ -3638,6 +3702,7 @@ HWY_INLINE Vec128<T, N> Idx16x8FromBits(
   // with the doubling baked into the table. Unpacking nibbles is likely more
   // costly than the higher cache footprint from storing bytes.
   alignas(16) constexpr uint8_t table[256 * 8] = {
+      // PrintCompress16x8Tables
       0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
       2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
       4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
@@ -3772,12 +3837,161 @@ HWY_INLINE Vec128<T, N> Idx16x8FromBits(
   return BitCast(d, pairs + Set(du, 0x0100));
 }
 
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 256);
+  const Simd<T, N, 0> d;
+  const Rebind<uint8_t, decltype(d)> d8;
+  const Simd<uint16_t, N, 0> du;
+
+  // We need byte indices for TableLookupBytes (one vector's worth for each of
+  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
+  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
+  // with the doubling baked into the table. Unpacking nibbles is likely more
+  // costly than the higher cache footprint from storing bytes.
+  alignas(16) constexpr uint8_t table[256 * 8] = {
+      // PrintCompressNot16x8Tables
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
+      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
+      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
+      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
+      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
+      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
+      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
+      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
+      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
+      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
+      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
+      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
+      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
+      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
+      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
+      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
+      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
+      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
+      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
+      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
+      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
+      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
+      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
+      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
+      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
+      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
+      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
+      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
+      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
+      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
+      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
+      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
+      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
+      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
+      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
+      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
+      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
+      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
+      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
+      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
+      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
+      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
+      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
+      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
+      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
+      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
+      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
+      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
+      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
+      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
+      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
+      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
+      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
+      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
+      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
+      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
+      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
+      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
+      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
+      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
+      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
+      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
+      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
+      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
+      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
+      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
+      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
+      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
+      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
+      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
+      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
+      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
+      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
+      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
+      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
+      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
+      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
+      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
+      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
+      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
+      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
+      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
+      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
+      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
+      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
+      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
+      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
+      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
+      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
+      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
+      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
+      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
+      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
+      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
+      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
+      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
+      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
+      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
+      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
+      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
+      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
+      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
+      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
+      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
+      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
+      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
+      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
+      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
+      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
+      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
+      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
+      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
+      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
+      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
+      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
+      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
+      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
+      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
+      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
+      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
+      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};
+
+  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+  return BitCast(d, pairs + Set(du, 0x0100));
+}
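A minimal scalar sketch (not part of the patch) of the "2*lane + 0..1" conversion described in the comment above, assuming little-endian byte order; the helper name is illustrative only:

#include <cstdint>

// Each table entry i equals 2*lane. ZipLower(i, i) forms the 16-bit value
// i | (i << 8) == i * 0x0101; adding 0x0100 turns the high byte into i + 1,
// so the byte pair (i, i + 1) selects both bytes of the chosen 16-bit lane.
inline uint16_t LaneIdxToBytePair(uint8_t i) {
  const uint16_t zipped = static_cast<uint16_t>(i) * 0x0101u;
  return static_cast<uint16_t>(zipped + 0x0100u);
}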
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 16);
 
   // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t packed_array[16 * 16] = {
+  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+      // PrintCompress32x4Tables
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
       4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
@@ -3796,15 +4010,43 @@ HWY_INLINE Vec128<T, N> Idx32x4FromBits(
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
   const Simd<T, N, 0> d;
   const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
 }
 
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 16);
+
+  // There are only 4 lanes, so we can afford to load the index vector directly.
+  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+      // PrintCompressNot32x4Tables
+      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
+      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
+      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
+      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
+      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
+      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
+      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
+      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
+      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
+      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
+      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+      12, 13, 14, 15};
+  const Simd<T, N, 0> d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 4);
 
   // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t packed_array[4 * 16] = {
+  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
+      // PrintCompress64x2Tables
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
@@ -3812,34 +4054,40 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(
 
   const Simd<T, N, 0> d;
   const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
 }
 
-// Helper functions called by both Compress and CompressStore - avoids a
-// redundant BitsFromMask in the latter.
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 4);
 
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
-                                 const uint64_t mask_bits) {
-  const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
+  // There are only 2 lanes, so we can afford to load the index vector directly.
+  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
+      // PrintCompressNot64x2Tables
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
+      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
+
+  const Simd<T, N, 0> d;
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
 }
 
+// Helper functions called by both Compress and CompressStore - avoids a
+// redundant BitsFromMask in the latter.
+
 template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
-                                 const uint64_t mask_bits) {
-  const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
+HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
+  const auto idx = detail::IdxFromBits<T, N>(mask_bits);
   const DFromV<decltype(v)> d;
   const RebindToSigned<decltype(d)> di;
   return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
 }
 
 template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/, Vec128<T, N> v,
-                                 const uint64_t mask_bits) {
-  const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
+HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
+  const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
   const DFromV<decltype(v)> d;
   const RebindToSigned<decltype(d)> di;
   return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
@@ -3852,10 +4100,62 @@ struct CompressIsPartition {
   enum { value = 1 };
 };
 
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+  return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
+  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
+  const Full128<T> d;
+  const Vec128<T> m = VecFromMask(d, mask);
+  const Vec128<T> maskL = DupEven(m);
+  const Vec128<T> maskH = DupOdd(m);
+  const Vec128<T> swap = AndNot(maskL, maskH);
+  return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
+  return detail::Compress(v, detail::BitsFromMask(mask));
+}
+
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+  return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
+  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
+  const Full128<T> d;
+  const Vec128<T> m = VecFromMask(d, mask);
+  const Vec128<T> maskL = DupEven(m);
+  const Vec128<T> maskH = DupOdd(m);
+  const Vec128<T> swap = AndNot(maskH, maskL);
+  return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+  // For partial vectors, we cannot pull the Not() into the table because
+  // BitsFromMask clears the upper bits.
+  if (N < 16 / sizeof(T)) {
+    return detail::Compress(v, detail::BitsFromMask(Not(mask)));
+  }
+  return detail::CompressNot(v, detail::BitsFromMask(mask));
+}
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+                                           Mask128<uint64_t> /* m */) {
+  return v;
 }
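A hedged scalar reference for what Compress and CompressNot compute here (CompressIsPartition is 1, so unselected lanes follow the selected ones, both groups in original order); names are illustrative, not Highway API:

#include <cstddef>
#include <vector>

// Selected lanes (mask true) move to the front, the remaining lanes follow,
// each group keeping its original order - i.e. the result is a partition.
template <typename T>
std::vector<T> CompressRef(const std::vector<T>& v, const std::vector<bool>& m) {
  std::vector<T> out;
  out.reserve(v.size());
  for (size_t i = 0; i < v.size(); ++i) {
    if (m[i]) out.push_back(v[i]);
  }
  for (size_t i = 0; i < v.size(); ++i) {
    if (!m[i]) out.push_back(v[i]);
  }
  return out;
}

// CompressNot(v, m) behaves like Compress(v, Not(m)); the two-lane overloads
// above implement exactly this with a single conditional swap of the halves.
template <typename T>
std::vector<T> CompressNotRef(const std::vector<T>& v, std::vector<bool> m) {
  m.flip();
  return CompressRef(v, m);
}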
 
 // ------------------------------ CompressBits
@@ -3870,7 +4170,7 @@ HWY_API Vec128<T, N> CompressBits(Vec128
     mask_bits &= (1ull << N) - 1;
   }
 
-  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
+  return detail::Compress(v, mask_bits);
 }
 
 // ------------------------------ CompressStore
@@ -3878,7 +4178,7 @@ template <typename T, size_t N>
 HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
                              Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
   const uint64_t mask_bits = detail::BitsFromMask(mask);
-  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
+  const auto c = detail::Compress(v, mask_bits);
   StoreU(c, d, unaligned);
   return PopCount(mask_bits);
 }
@@ -3892,8 +4192,7 @@ HWY_API size_t CompressBlendedStore(Vec1
   using TU = TFromD<decltype(du)>;
   const uint64_t mask_bits = detail::BitsFromMask(m);
   const size_t count = PopCount(mask_bits);
-  const Vec128<TU, N> compressed =
-      detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
+  const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
   const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
   BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
   return count;
@@ -3912,7 +4211,7 @@ HWY_API size_t CompressBitsStore(Vec128<
     mask_bits &= (1ull << N) - 1;
   }
 
-  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
+  const auto c = detail::Compress(v, mask_bits);
   StoreU(c, d, unaligned);
   return PopCount(mask_bits);
 }
@@ -4089,15 +4388,6 @@ HWY_API Vec128<T, N> MaxOfLanes(Simd<T,
 
 // ------------------------------ Lt128
 
-namespace detail {
-
-template <size_t kLanes, typename T, size_t N>
-Mask128<T, N> ShiftMaskLeft(Mask128<T, N> m) {
-  return MaskFromVec(ShiftLeftLanes<kLanes>(VecFromMask(Simd<T, N, 0>(), m)));
-}
-
-}  // namespace detail
-
 template <typename T, size_t N, HWY_IF_LE128(T, N)>
 HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
                                Vec128<T, N> b) {
@@ -4116,14 +4406,21 @@ HWY_INLINE Mask128<T, N> Lt128(Simd<T, N
   //  1  0  0  1  |  1
   //  1  1  0  0  |  0
   const Mask128<T, N> eqHL = Eq(a, b);
-  const Mask128<T, N> ltHL = Lt(a, b);
+  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
   // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
   // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
-  // comparison result leftwards requires only 4.
-  const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
-  const Mask128<T, N> outHx = Or(ltHL, And(eqHL, ltLx));
-  const Vec128<T, N> vecHx = VecFromMask(d, outHx);
-  return MaskFromVec(InterleaveUpper(d, vecHx, vecHx));
+  // comparison result leftwards requires only 4. IfThenElse compiles to the
+  // same code as OrAnd().
+  const Vec128<T, N> ltLx = DupEven(ltHL);
+  const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
+  return MaskFromVec(DupOdd(outHx));
+}
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+                                    Vec128<T, N> b) {
+  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
+  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
 }
 
 // ------------------------------ Min128, Max128 (Lt128)
@@ -4136,7 +4433,17 @@ HWY_INLINE VFromD<D> Min128(D d, const V
 
 template <class D>
 HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128(d, a, b), b, a);
+  return IfThenElse(Lt128(d, b, a), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+  return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+  return IfThenElse(Lt128Upper(d, b, a), a, b);
 }
 
 // ================================================== Operator wrapper
diff -pruN 0.17.0-11/hwy/ops/wasm_256-inl.h 1.0.0-2/hwy/ops/wasm_256-inl.h
--- 0.17.0-11/hwy/ops/wasm_256-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/wasm_256-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -2304,6 +2304,50 @@ HWY_API Vec256<uint8_t> U8FromU32(const
   return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
 }
 
+// ------------------------------ Truncations
+
+HWY_API Vec256<uint8_t, 4> TruncateTo(Simd<uint8_t, 4, 0> /* tag */,
+                                      const Vec256<uint64_t> v) {
+  return Vec256<uint8_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24,
+                                               0, 8, 16, 24, 0, 8, 16, 24, 0, 8,
+                                               16, 24)};
+}
+
+HWY_API Vec256<uint16_t, 4> TruncateTo(Simd<uint16_t, 4, 0> /* tag */,
+                                       const Vec256<uint64_t> v) {
+  return Vec256<uint16_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9,
+                                                16, 17, 24, 25, 0, 1, 8, 9, 16,
+                                                17, 24, 25)};
+}
+
+HWY_API Vec256<uint32_t, 4> TruncateTo(Simd<uint32_t, 4, 0> /* tag */,
+                                       const Vec256<uint64_t> v) {
+  return Vec256<uint32_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3,
+                                                8, 9, 10, 11, 16, 17, 18, 19,
+                                                24, 25, 26, 27)};
+}
+
+HWY_API Vec256<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> /* tag */,
+                                      const Vec256<uint32_t> v) {
+  return Vec256<uint8_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12,
+                                               16, 20, 24, 28, 0, 4, 8, 12, 16,
+                                               20, 24, 28)};
+}
+
+HWY_API Vec256<uint16_t, 8> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
+                                       const Vec256<uint32_t> v) {
+  return Vec256<uint16_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5,
+                                                8, 9, 12, 13, 16, 17, 20, 21,
+                                                24, 25, 28, 29)};
+}
+
+HWY_API Vec256<uint8_t, 16> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
+                                       const Vec256<uint16_t> v) {
+  return Vec256<uint8_t, 16>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6,
+                                                8, 10, 12, 14, 16, 18, 20, 22,
+                                                24, 26, 28, 30)};
+}
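A hedged scalar sketch of the per-lane semantics of TruncateTo above (each wide lane keeps only its low bits, in lane order; only the first N result lanes are meaningful); the helper name is illustrative:

#include <cstdint>
#include <vector>

template <typename To, typename From>
std::vector<To> TruncateToRef(const std::vector<From>& v) {
  std::vector<To> out;
  out.reserve(v.size());
  for (const From x : v) {
    out.push_back(static_cast<To>(x));  // keep the low sizeof(To) bytes
  }
  return out;
}

// Example: TruncateToRef<uint8_t, uint64_t>({0x1122334455667788ull}) yields {0x88}.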
+
 // ------------------------------ Convert i32 <=> f32 (Round)
 
 HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
@@ -2758,6 +2802,18 @@ HWY_API Vec256<T> Compress(Vec256<T> v,
   return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
 }
 
+// ------------------------------ CompressNot
+template <typename T>
+HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {

+  return Compress(v, Not(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
+                                           Mask256<uint64_t> mask) {
+  HWY_ASSERT(0);
+}
+
 // ------------------------------ CompressBits
 
 template <typename T>
@@ -2952,11 +3008,20 @@ template <typename T>
 HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
 
 template <typename T>
+HWY_INLINE Mask256<T> Lt128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
+template <typename T>
 HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
 
 template <typename T>
 HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
 
+template <typename T>
+HWY_INLINE Vec256<T> Min128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
+template <typename T>
+HWY_INLINE Vec256<T> Max128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
diff -pruN 0.17.0-11/hwy/ops/x86_128-inl.h 1.0.0-2/hwy/ops/x86_128-inl.h
--- 0.17.0-11/hwy/ops/x86_128-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/x86_128-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -35,15 +35,6 @@
 #include <sanitizer/msan_interface.h>
 #endif
 
-// Clang 3.9 generates VINSERTF128 instead of the desired VBROADCASTF128,
-// which would free up port5. However, inline assembly isn't supported on
-// MSVC, results in incorrect output on GCC 8.3, and raises "invalid output size
-// for constraint" errors on Clang (https://gcc.godbolt.org/z/-Jt_-F), hence we
-// disable it.
-#ifndef HWY_LOADDUP_ASM
-#define HWY_LOADDUP_ASM 0
-#endif
-
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
@@ -143,7 +134,7 @@ struct RawMask128<8> {
 
 }  // namespace detail
 
-template <typename T, size_t N>
+template <typename T, size_t N = 16 / sizeof(T)>
 struct Mask128 {
   using Raw = typename detail::RawMask128<sizeof(T)>::type;
 
@@ -583,16 +574,26 @@ HWY_API Vec128<T, N> PopulationCount(Vec
 
 // ------------------------------ Neg
 
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) {
   return Xor(v, SignBit(DFromV<decltype(v)>()));
 }
 
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, const Vec128<T, N> v) {
   return Zero(DFromV<decltype(v)>()) - v;
 }
 
+}  // namespace detail
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) {
+  return detail::Neg(hwy::IsFloatTag<T>(), v);
+}
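A minimal standalone illustration of the tag-dispatch pattern adopted above (instead of SFINAE, for MSVC 2017 compatibility); the tag and function names here are made up, not Highway's:

#include <type_traits>

struct FloatTagX {};
struct NonFloatTagX {};
template <typename T>
using TagX = typename std::conditional<std::is_floating_point<T>::value,
                                       FloatTagX, NonFloatTagX>::type;

// Overload resolution on the tag argument selects the implementation; both
// overloads are always well-formed, so no SFINAE machinery is needed.
template <typename T> T NegX(FloatTagX, T v) { return -v; }
template <typename T> T NegX(NonFloatTagX, T v) { return T(0) - v; }

template <typename T> T NegX(T v) { return NegX(TagX<T>(), v); }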
+
 // ------------------------------ Abs
 
 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
@@ -813,7 +814,7 @@ HWY_API Vec128<double, N> IfThenZeroElse
 
 // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
 #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
-#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
+#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
     HWY_COMPILER_CLANG >= 800
 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
 #else
@@ -996,6 +997,7 @@ HWY_API Mask128<T, N> Xor(const Mask128<
 template <typename T, size_t N>
 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
   // Flip only the valid bits.
+  // TODO(janwas): use _knot intrinsics if N >= 8.
   return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
 }
 
@@ -1206,8 +1208,9 @@ HWY_API VI TableLookupBytesOr0(const V b
 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
 
 // Swap 32-bit halves in 64-bit halves.
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+template <typename T, size_t N>
 HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
+  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
   static_assert(N == 2 || N == 4, "Does not make sense for N=1");
   return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
 }
@@ -1688,7 +1691,7 @@ HWY_API Mask128<double, N> operator==(co
 
 // ------------------------------ Inequality
 
-// This cannot have T as a template argument, otherwise it is not more 
+// This cannot have T as a template argument, otherwise it is not more
 // specialized than rewritten operator== in C++20, leading to compile
 // errors: https://gcc.godbolt.org/z/xsrPhPvPT.
 template <size_t N>
@@ -1745,57 +1748,71 @@ HWY_API Mask128<double, N> operator!=(co
 
 // ------------------------------ Strict inequality
 
-// Signed/float <
+namespace detail {
+
 template <size_t N>
-HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a,
+                                 Vec128<int8_t, N> b) {
   return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
 }
 template <size_t N>
-HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
-                                      Vec128<int16_t, N> b) {
+HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a,
+                                  Vec128<int16_t, N> b) {
   return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
 }
 template <size_t N>
-HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
-                                      Vec128<int32_t, N> b) {
+HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a,
+                                  Vec128<int32_t, N> b) {
   return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
 }
 
-template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
-HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
+template <size_t N>
+HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/,
+                                  const Vec128<int64_t, N> a,
+                                  const Vec128<int64_t, N> b) {
+#if HWY_TARGET == HWY_SSSE3
+  // See https://stackoverflow.com/questions/65166174/:
+  const Simd<int64_t, N, 0> d;
+  const RepartitionToNarrow<decltype(d)> d32;
+  const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
+  const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
+  // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
+  // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
+  const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
+  // Duplicate upper to lower half.
+  return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
+#else
+  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};  // SSE4.2
+#endif
+}
+
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
+                            Vec128<T, N> b) {
   const DFromV<decltype(a)> du;
   const RebindToSigned<decltype(du)> di;
   const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
-  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
+  const auto sa = BitCast(di, Xor(a, msb));
+  const auto sb = BitCast(di, Xor(b, msb));
+  return RebindMask(du, Gt(hwy::SignedTag(), sa, sb));
 }
 
 template <size_t N>
-HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
+HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a,
+                                Vec128<float, N> b) {
   return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
 }
 template <size_t N>
-HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
+HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a,
+                                 Vec128<double, N> b) {
   return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
 }
 
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
-                                      const Vec128<int64_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  // If the upper half is less than or greater, this is the answer.
-  const __m128i m_gt = _mm_cmpgt_epi32(a.raw, b.raw);
+}  // namespace detail
 
-  // Otherwise, the lower half decides.
-  const __m128i m_eq = _mm_cmpeq_epi32(a.raw, b.raw);
-  const __m128i lo_in_hi = _mm_shuffle_epi32(m_gt, _MM_SHUFFLE(2, 2, 0, 0));
-  const __m128i lo_gt = _mm_and_si128(m_eq, lo_in_hi);
-
-  const __m128i gt = _mm_or_si128(lo_gt, m_gt);
-  // Copy result in upper 32 bits to lower 32 bits.
-  return Mask128<int64_t, N>{_mm_shuffle_epi32(gt, _MM_SHUFFLE(3, 3, 1, 1))};
-#else
-  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};  // SSE4.2
-#endif
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
+  return detail::Gt(hwy::TypeTag<T>(), a, b);
 }
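A hedged scalar reference for the SSSE3 signed 64-bit comparison above: the result follows from a signed comparison of the upper 32-bit halves, falling back to an unsigned comparison of the lower halves when the upper halves are equal (arithmetic right shift assumed); the helper name is illustrative:

#include <cstdint>

inline bool I64GtRef(int64_t a, int64_t b) {
  const int32_t a_hi = static_cast<int32_t>(a >> 32);  // signed upper half
  const int32_t b_hi = static_cast<int32_t>(b >> 32);
  const uint32_t a_lo = static_cast<uint32_t>(a);      // unsigned lower half
  const uint32_t b_lo = static_cast<uint32_t>(b);
  return (a_hi > b_hi) || (a_hi == b_hi && a_lo > b_lo);
}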
 
 // ------------------------------ Weak inequality
@@ -2662,8 +2679,9 @@ HWY_API Vec128<int64_t, N> ShiftRight(co
 }
 
 // ------------------------------ ZeroIfNegative (BroadcastSignBit)
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only works for float");
   const DFromV<decltype(v)> d;
 #if HWY_TARGET == HWY_SSSE3
   const RebindToSigned<decltype(d)> di;
@@ -4489,17 +4507,29 @@ HWY_API Vec128<T, N> Combine(Simd<T, N,
 
 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
 
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T> ZeroExtendVector(Full128<T> /* tag */, Vec64<T> lo) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T>
+HWY_INLINE Vec128<T> ZeroExtendVector(hwy::NonFloatTag /*tag*/,
+                                      Full128<T> /* d */, Vec64<T> lo) {
   return Vec128<T>{_mm_move_epi64(lo.raw)};
 }
 
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
+template <typename T>
+HWY_INLINE Vec128<T> ZeroExtendVector(hwy::FloatTag /*tag*/, Full128<T> d,
+                                      Vec64<T> lo) {
   const RebindToUnsigned<decltype(d)> du;
   return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
 }
 
+}  // namespace detail
+
+template <typename T>
+HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
+  return detail::ZeroExtendVector(hwy::IsFloatTag<T>(), d, lo);
+}
+
 template <typename T, size_t N, HWY_IF_LE64(T, N)>
 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
   return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
@@ -4633,10 +4663,11 @@ HWY_API Vec32<T> ConcatOdd(Simd<T, 4, 0>
 // 16-bit full
 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const Repartition<uint32_t, decltype(d)> dw;
-  // Right-shift 16 bits per u32 so we can pack.
-  const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
-  const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
+  // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
+  // 0xFFFF8000, which correctly saturates to 0x8000.
+  const Repartition<int32_t, decltype(d)> dw;
+  const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
+  const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
   return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
 }
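A hedged worked example of the signed-shift trick in ConcatOdd above: after an arithmetic right shift by 16, the value already lies in [-32768, 32767], so the saturating pack reproduces the original upper 16 bits exactly; the helper name is illustrative:

#include <cstdint>

inline uint16_t OddHalfViaSignedShift(uint32_t lane) {
  const int32_t shifted = static_cast<int32_t>(lane) >> 16;  // 0x8000xxxx -> 0xFFFF8000 (-32768)
  // A saturating pack clamps to [-32768, 32767]; shifted is already in that
  // range, so the clamp is a no-op and the low 16 bits equal lane >> 16.
  const int32_t clamped = shifted > 32767 ? 32767 : (shifted < -32768 ? -32768 : shifted);
  return static_cast<uint16_t>(clamped);
}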
 
@@ -4713,12 +4744,22 @@ HWY_API Vec32<T> ConcatEven(Simd<T, 4, 0
 // 16-bit full
 template <typename T, HWY_IF_LANE_SIZE(T, 2)>
 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const Repartition<uint32_t, decltype(d)> dw;
+#if HWY_TARGET <= HWY_SSE4
   // Isolate lower 16 bits per u32 so we can pack.
+  const Repartition<uint32_t, decltype(d)> dw;
   const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
   const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
   const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
-  return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
+  return Vec128<T>{_mm_packus_epi32(uL.raw, uH.raw)};
+#else
+  // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two
+  // inputs, then concatenate them.
+  alignas(16) const T kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
+  const Vec128<T> shuf = BitCast(d, Load(d, kCompactEvenU16));
+  const Vec128<T> L = TableLookupBytes(lo, shuf);
+  const Vec128<T> H = TableLookupBytes(hi, shuf);
+  return ConcatLowerLower(d, H, L);
+#endif
 }
 
 // 16-bit x4
@@ -4875,8 +4916,8 @@ HWY_API Vec128<T, N> SwapAdjacentBlocks(
 // two from loading float exponents, which is considerably faster (according
 // to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
 
-#if HWY_TARGET > HWY_AVX3  // AVX2 or older
 namespace detail {
+#if HWY_TARGET > HWY_AVX3  // AVX2 or older
 
 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
@@ -4909,39 +4950,38 @@ HWY_INLINE Vec128<MakeUnsigned<T>, N> Po
   return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
 }
 
-}  // namespace detail
 #endif  // HWY_TARGET > HWY_AVX3
 
 template <size_t N>
-HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
-                                       const Vec128<uint16_t, N> bits) {
+HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v,
+                                Vec128<uint16_t, N> bits) {
 #if HWY_TARGET <= HWY_AVX3
   return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
 #else
-  return v * detail::Pow2(bits);
+  return v * Pow2(bits);
 #endif
 }
-HWY_API Vec128<uint16_t, 1> operator<<(const Vec128<uint16_t, 1> v,
-                                       const Vec128<uint16_t, 1> bits) {
+HWY_API Vec128<uint16_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, 1> v,
+                                Vec128<uint16_t, 1> bits) {
   return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
 }
 
 template <size_t N>
-HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
-                                       const Vec128<uint32_t, N> bits) {
+HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v,
+                                Vec128<uint32_t, N> bits) {
 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return v * detail::Pow2(bits);
+  return v * Pow2(bits);
 #else
   return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
 #endif
 }
-HWY_API Vec128<uint32_t, 1> operator<<(const Vec128<uint32_t, 1> v,
-                                       const Vec128<uint32_t, 1> bits) {
+HWY_API Vec128<uint32_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, 1> v,
+                                const Vec128<uint32_t, 1> bits) {
   return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
 }
 
-HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
-                                    const Vec128<uint64_t> bits) {
+HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v,
+                             Vec128<uint64_t> bits) {
 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
   // Individual shifts and combine
   const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
@@ -4952,17 +4992,26 @@ HWY_API Vec128<uint64_t> operator<<(cons
   return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
 #endif
 }
-HWY_API Vec64<uint64_t> operator<<(const Vec64<uint64_t> v,
-                                   const Vec64<uint64_t> bits) {
+HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v,
+                            Vec64<uint64_t> bits) {
   return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
 }
 
 // Signed left shift is the same as unsigned.
-template <typename T, size_t N, HWY_IF_SIGNED(T)>
-HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
+                         Vec128<T, N> bits) {
   const DFromV<decltype(v)> di;
   const RebindToUnsigned<decltype(di)> du;
-  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
+  return BitCast(di,
+                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
+}
+
+}  // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
+  return detail::Shl(hwy::TypeTag<T>(), v, bits);
 }
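A brief sketch of why the signed Shl overload above can defer to the unsigned one: a left shift produces the same bit pattern for signed and unsigned two's-complement values, and performing it on the unsigned type also sidesteps signed-overflow issues; names are illustrative:

#include <cstdint>
#include <cstring>

inline int32_t ShlI32ViaU32(int32_t v, unsigned bits) {
  uint32_t u;
  std::memcpy(&u, &v, sizeof u);            // bit-cast, like BitCast(du, v)
  u = static_cast<uint32_t>(u << bits);     // unsigned shift, same bit pattern
  std::memcpy(&v, &u, sizeof v);            // bit-cast back, like BitCast(di, ...)
  return v;
}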
 
 // ------------------------------ Shr (mul, mask, BroadcastSignBit)
@@ -5433,7 +5482,7 @@ HWY_INLINE auto FixConversionOverflow(DI
   //   ++: normal >0                       : OK
   const auto converted = VFromD<DI>{converted_raw};
   const auto sign_wrong = AndNot(BitCast(di, original), converted);
-#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#if HWY_COMPILER_GCC_ACTUAL
   // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
   // Add() if using that instead. Work around with one more instruction.
   const RebindToUnsigned<DI> du;
@@ -5466,6 +5515,65 @@ HWY_API Vec128<uint8_t, N> U8FromU32(con
   return LowerHalf(LowerHalf(BitCast(d8, quad)));
 }
 
+// ------------------------------ Truncations
+
+template <typename From, typename To,
+          hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
+HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
+                                 const Vec128<From, 1> v) {
+  static_assert(!IsSigned<To>() && !IsSigned<From>(), "Unsigned only");
+  const Repartition<To, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  return Vec128<To, 1>{v1.raw};
+}
+
+HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
+                                      const Vec128<uint64_t, 2> v) {
+  const Full128<uint8_t> d8;
+  alignas(16) static constexpr uint8_t kMap[16] = {0, 8, 0, 8, 0, 8, 0, 8,
+                                                   0, 8, 0, 8, 0, 8, 0, 8};
+  return LowerHalf(LowerHalf(LowerHalf(TableLookupBytes(v, Load(d8, kMap)))));
+}
+
+HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
+                                       const Vec128<uint64_t, 2> v) {
+  const Full128<uint16_t> d16;
+  alignas(16) static constexpr uint16_t kMap[8] = {
+      0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
+  return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d16, kMap))));
+}
+
+HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
+                                       const Vec128<uint64_t, 2> v) {
+  return Vec128<uint32_t, 2>{_mm_shuffle_epi32(v.raw, 0x88)};
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint32_t, N> v) {
+  const Repartition<uint8_t, DFromV<decltype(v)>> d;
+  alignas(16) static constexpr uint8_t kMap[16] = {
+      0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
+      0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
+  return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kMap))));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+                                       const Vec128<uint32_t, N> v) {
+  const Repartition<uint16_t, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  return LowerHalf(ConcatEven(d, v1, v1));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+                                      const Vec128<uint16_t, N> v) {
+  const Repartition<uint8_t, DFromV<decltype(v)>> d;
+  const auto v1 = BitCast(d, v);
+  return LowerHalf(ConcatEven(d, v1, v1));
+}
+
 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
 
 template <size_t N>
@@ -5573,8 +5681,9 @@ HWY_API Vec128<int32_t, N> NearestInt(co
 #if HWY_TARGET == HWY_SSSE3
 
 // Toward nearest integer, ties to even
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   // Rely on rounding after addition with a large value such that no mantissa
   // bits remain (assuming the current mode is nearest-even). We may need a
   // compiler flag for precise floating-point to prevent "optimizing" this out.
@@ -5592,16 +5701,18 @@ namespace detail {
 // Truncating to integer and converting back to float is correct except when the
 // input magnitude is large, in which case the input was already an integer
 // (because mantissa >> exponent is zero).
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   return Abs(v) < Set(Simd<T, N, 0>(), MantissaEnd<T>());
 }
 
 }  // namespace detail
 
 // Toward zero, aka truncate
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Simd<T, N, 0> df;
   const RebindToSigned<decltype(df)> di;
 
@@ -5612,8 +5723,9 @@ HWY_API Vec128<T, N> Trunc(const Vec128<
 }
 
 // Toward +infinity, aka ceiling
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Simd<T, N, 0> df;
   const RebindToSigned<decltype(df)> di;
 
@@ -5627,8 +5739,9 @@ HWY_API Vec128<T, N> Ceil(const Vec128<T
 }
 
 // Toward -infinity, aka floor
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Simd<T, N, 0> df;
   const RebindToSigned<decltype(df)> di;
 
@@ -5737,8 +5850,9 @@ HWY_API Mask128<double, N> IsFinite(cons
 
 #else
 
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Simd<T, N, 0> d;
   const RebindToSigned<decltype(d)> di;
   const VFromD<decltype(di)> vi = BitCast(di, v);
@@ -5747,8 +5861,9 @@ HWY_API Mask128<T, N> IsInf(const Vec128
 }
 
 // Returns whether normal/subnormal/zero.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
 HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Simd<T, N, 0> d;
   const RebindToUnsigned<decltype(d)> du;
   const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
@@ -5983,6 +6098,12 @@ HWY_INLINE Vec128<uint16_t> IndicesForCo
 }  // namespace detail
 #endif  // HWY_TARGET != HWY_AVX3_DL
 
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+  return v;
+}
+
 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
   const Simd<T, N, 0> d;
@@ -6003,28 +6124,47 @@ HWY_API Vec128<T, N> Compress(Vec128<T,
   return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
 }
 
-template <size_t N>
+template <size_t N, HWY_IF_GE64(float, N)>
 HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
   return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
 }
 
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
   HWY_DASSERT(mask.raw < 4);
 
   // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t packed_array[64] = {
+  alignas(16) constexpr uint8_t u8_indices[64] = {
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
 
-  const Simd<T, N, 0> d;
+  const Full128<T> d;
   const Repartition<uint8_t, decltype(d)> d8;
-  const auto index = Load(d8, packed_array + 16 * mask.raw);
+  const auto index = Load(d8, u8_indices + 16 * mask.raw);
   return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
 }
 
+// ------------------------------ CompressNot (Compress)
+
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+  return v;
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+  return Compress(v, Not(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+                                           Mask128<uint64_t> /* m */) {
+  return v;
+}
+
 // ------------------------------ CompressBits (LoadMaskBits)
 
 template <typename T, size_t N>
@@ -6313,6 +6453,7 @@ HWY_API intptr_t FindFirstTrue(const Sim
 
 namespace detail {
 
+// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 256);
@@ -6328,6 +6469,7 @@ HWY_INLINE Vec128<T, N> IndicesFromBits(
   // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
   // is likely more costly than the higher cache footprint from storing bytes.
   alignas(16) constexpr uint8_t table[2048] = {
+      // PrintCompress16x8Tables
       0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
       2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
       4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
@@ -6462,12 +6604,164 @@ HWY_INLINE Vec128<T, N> IndicesFromBits(
   return BitCast(d, pairs + Set(du, 0x0100));
 }
 
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
+                                           uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 256);
+  const Rebind<uint8_t, decltype(d)> d8;
+  const Simd<uint16_t, N, 0> du;
+
+  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
+  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
+  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
+  // store lane indices and convert to byte indices (2*lane + 0..1), with the
+  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
+  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
+  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
+  // is likely more costly than the higher cache footprint from storing bytes.
+  alignas(16) constexpr uint8_t table[2048] = {
+      // PrintCompressNot16x8Tables
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
+      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
+      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
+      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
+      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
+      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
+      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
+      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
+      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
+      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
+      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
+      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
+      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
+      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
+      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
+      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
+      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
+      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
+      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
+      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
+      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
+      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
+      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
+      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
+      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
+      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
+      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
+      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
+      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
+      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
+      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
+      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
+      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
+      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
+      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
+      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
+      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
+      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
+      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
+      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
+      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
+      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
+      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
+      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
+      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
+      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
+      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
+      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
+      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
+      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
+      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
+      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
+      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
+      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
+      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
+      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
+      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
+      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
+      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
+      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
+      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
+      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
+      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
+      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
+      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
+      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
+      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
+      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
+      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
+      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
+      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
+      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
+      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
+      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
+      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
+      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
+      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
+      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
+      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
+      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
+      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
+      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
+      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
+      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
+      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
+      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
+      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
+      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
+      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
+      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
+      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
+      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
+      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
+      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
+      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
+      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
+      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
+      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
+      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
+      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
+      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
+      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
+      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
+      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
+      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
+      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
+      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
+      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
+      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
+      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
+      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
+      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
+      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
+      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
+      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
+      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
+      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
+      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
+      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
+      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
+      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
+      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};
+
+  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+  return BitCast(d, pairs + Set(du, 0x0100));
+}
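+// Worked example (illustration, not part of the source): a table byte of 4
+// denotes u16 lane 2 (byte index = 2 * lane). ZipLower(byte_idx, byte_idx)
+// widens each byte b to the 16-bit value (b << 8) | b, e.g. 4 -> 0x0404;
+// adding 0x0100 turns that into 0x0504, i.e. byte indices {4, 5}, so the
+// caller's TableLookupBytes gathers both bytes of the selected lane.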
+
 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
   HWY_DASSERT(mask_bits < 16);
 
   // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t packed_array[256] = {
+  alignas(16) constexpr uint8_t u8_indices[256] = {
+      // PrintCompress32x4Tables
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
       4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
@@ -6486,7 +6780,35 @@ HWY_INLINE Vec128<T, N> IndicesFromBits(
       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
 
   const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
+HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
+                                           uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 16);
+
+  // There are only 4 lanes, so we can afford to load the index vector directly.
+  alignas(16) constexpr uint8_t u8_indices[256] = {
+      // PrintCompressNot32x4Tables
+      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
+      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
+      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
+      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
+      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
+      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
+      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
+      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
+      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
+      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
+      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
+      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+      12, 13, 14, 15};
+
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
 }
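+// Worked example (illustration): for mask_bits = 1 (only lane 0 set), the
+// table entry is bytes {4..15, 0..3}, i.e. lane order {1, 2, 3, 0}: the lanes
+// whose mask bit is clear move to the front, followed by lane 0.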
 
 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
@@ -6494,14 +6816,32 @@ HWY_INLINE Vec128<T, N> IndicesFromBits(
   HWY_DASSERT(mask_bits < 4);
 
   // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t packed_array[64] = {
+  alignas(16) constexpr uint8_t u8_indices[64] = {
+      // PrintCompress64x2Tables
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
       8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
 
   const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
+HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
+                                           uint64_t mask_bits) {
+  HWY_DASSERT(mask_bits < 4);
+
+  // There are only 2 lanes, so we can afford to load the index vector directly.
+  alignas(16) constexpr uint8_t u8_indices[64] = {
+      // PrintCompressNot64x2Tables
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
+      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
+
+  const Repartition<uint8_t, decltype(d)> d8;
+  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
 }
 
 template <typename T, size_t N>
@@ -6514,11 +6854,75 @@ HWY_API Vec128<T, N> CompressBits(Vec128
   return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
 }
 
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
+  const Simd<T, N, 0> d;
+  const RebindToUnsigned<decltype(d)> du;
+
+  HWY_DASSERT(mask_bits < (1ull << N));
+  const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits));
+  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
+}
+
 }  // namespace detail
 
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> m) {
-  return detail::CompressBits(v, detail::BitsFromMask(m));
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+  return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
+  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
+  const Full128<T> d;
+  const Vec128<T> m = VecFromMask(d, mask);
+  const Vec128<T> maskL = DupEven(m);
+  const Vec128<T> maskH = DupOdd(m);
+  const Vec128<T> swap = AndNot(maskL, maskH);
+  return IfVecThenElse(swap, Shuffle01(v), v);
+}
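+// Illustration (not part of the source): the only mask requiring movement is
+// {false, true} - lane 1 is the sole selected lane and must move to index 0.
+// AndNot(maskL, maskH) = maskH & ~maskL is true exactly in that case, so
+// Shuffle01 swaps the two 64-bit lanes; every other mask already forms a
+// valid partition and v is returned unchanged.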
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
+  return detail::CompressBits(v, detail::BitsFromMask(mask));
+}
+
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+  return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
+  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
+  const Full128<T> d;
+  const Vec128<T> m = VecFromMask(d, mask);
+  const Vec128<T> maskL = DupEven(m);
+  const Vec128<T> maskH = DupOdd(m);
+  const Vec128<T> swap = AndNot(maskH, maskL);
+  return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+  // For partial vectors, we cannot pull the Not() into the table because
+  // BitsFromMask clears the upper bits.
+  if (N < 16 / sizeof(T)) {
+    return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
+  }
+  return detail::CompressNotBits(v, detail::BitsFromMask(mask));
+}
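+// Illustration: with N = 2 lanes of uint32_t, BitsFromMask yields only bits
+// [0, 2), but the NotBits tables are generated for all 16/sizeof(T) lanes of
+// a full vector; the nonexistent upper lanes would look unselected and be
+// ordered ahead of the real lanes, breaking the partition. Hence the partial
+// case flips the mask instead and reuses the normal Compress table.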
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+                                           Mask128<uint64_t> /* m */) {
+  return v;
 }
 
 template <typename T, size_t N>
@@ -6756,13 +7160,21 @@ HWY_INLINE V Lt128Vec(const D d, const V
   //  1  0  0  0  |  0
   //  1  0  0  1  |  1
   //  1  1  0  0  |  0
-  const V eqHL = VecFromMask(d, Eq(a, b));
+  const auto eqHL = Eq(a, b);
   const V ltHL = VecFromMask(d, Lt(a, b));
   const V ltLX = ShiftLeftLanes<1>(ltHL);
-  const V vecHx = OrAnd(ltHL, eqHL, ltLX);
+  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
   return InterleaveUpper(d, vecHx, vecHx);
 }
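+// Worked example (illustration): a = {lo = 5, hi = 1}, b = {lo = 7, hi = 1},
+// where each 128-bit value occupies two u64 lanes with the upper lane as the
+// high half. eqHL is true only in the upper lane, so vecHx there takes ltLX,
+// which carries the lower-lane result 5 < 7; InterleaveUpper then broadcasts
+// that upper-lane bit to both lanes, marking the entire block as a < b.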
 
+template <class D, class V = VFromD<D>>
+HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
+  // No specialization required for AVX-512: Mask <-> Vec is fast, and
+  // copying mask bits to their neighbor seems infeasible.
+  const V ltHL = VecFromMask(d, Lt(a, b));
+  return InterleaveUpper(d, ltHL, ltHL);
+}
+
 }  // namespace detail
 
 template <class D, class V = VFromD<D>>
@@ -6770,6 +7182,11 @@ HWY_API MFromD<D> Lt128(D d, const V a,
   return MaskFromVec(detail::Lt128Vec(d, a, b));
 }
 
+template <class D, class V = VFromD<D>>
+HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
+  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
+}
+
 // ------------------------------ Min128, Max128 (Lt128)
 
 // Avoids the extra MaskFromVec in Lt128.
@@ -6780,7 +7197,17 @@ HWY_API V Min128(D d, const V a, const V
 
 template <class D, class V = VFromD<D>>
 HWY_API V Max128(D d, const V a, const V b) {
-  return IfVecThenElse(detail::Lt128Vec(d, a, b), b, a);
+  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
+}
+
+template <class D, class V = VFromD<D>>
+HWY_API V Min128Upper(D d, const V a, const V b) {
+  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
+}
+
+template <class D, class V = VFromD<D>>
+HWY_API V Max128Upper(D d, const V a, const V b) {
+  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
 }
 
 // ================================================== Operator wrapper
diff -pruN 0.17.0-11/hwy/ops/x86_256-inl.h 1.0.0-2/hwy/ops/x86_256-inl.h
--- 0.17.0-11/hwy/ops/x86_256-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/x86_256-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -20,11 +20,20 @@
 // WARNING: most operations do not cross 128-bit block boundaries. In
 // particular, "Broadcast", pack and zip behavior may be surprising.
 
+// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
+#include "hwy/base.h"
+
+// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
+// https://github.com/google/highway/issues/710
+HWY_DIAGNOSTICS(push)
+#if HWY_COMPILER_GCC_ACTUAL
+HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
+HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
+#endif
+
 // Must come before HWY_COMPILER_CLANGCL
 #include <immintrin.h>  // AVX2+
 
-#include "hwy/base.h"
-
 #if HWY_COMPILER_CLANGCL
 // Including <immintrin.h> should be enough, but Clang's headers helpfully skip
 // including these headers when _MSC_VER is defined, like when using clang-cl.
@@ -595,8 +604,9 @@ HWY_API Vec256<double> IfThenZeroElse(Ma
   return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
 }
 
-template <typename T, HWY_IF_FLOAT(T)>
+template <typename T>
 HWY_API Vec256<T> ZeroIfNegative(const Vec256<T> v) {
+  static_assert(IsSigned<T>(), "Only for float");
   // AVX3 MaskFromVec only looks at the MSB
   return IfThenZeroElse(MaskFromVec(v), v);
 }
@@ -833,8 +843,9 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256
   return AndNot(VecFromMask(Full256<T>(), mask), no);
 }
 
-template <typename T, HWY_IF_FLOAT(T)>
+template <typename T>
 HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
+  static_assert(IsSigned<T>(), "Only for float");
   const auto zero = Zero(Full256<T>());
   // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
   return IfThenElse(MaskFromVec(v), zero, v);
@@ -1135,11 +1146,10 @@ HWY_API Mask256<double> operator==(const
 
 // ------------------------------ Inequality
 
-template <typename T, HWY_IF_NOT_FLOAT(T)>
+template <typename T>
 HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
   return Not(a == b);
 }
-
 HWY_API Mask256<float> operator!=(const Vec256<float> a,
                                   const Vec256<float> b) {
   return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
@@ -1151,6 +1161,9 @@ HWY_API Mask256<double> operator!=(const
 
 // ------------------------------ Strict inequality
 
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
 // Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
 // to perform an unsigned comparison instead of the intended signed. Workaround
 // is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
@@ -1160,7 +1173,8 @@ HWY_API Mask256<double> operator!=(const
 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
 #endif
 
-HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
+HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a,
+                           Vec256<int8_t> b) {
 #if HWY_AVX2_GCC_CMPGT8_WORKAROUND
   using i8x32 = signed char __attribute__((__vector_size__(32)));
   return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
@@ -1169,34 +1183,43 @@ HWY_API Mask256<int8_t> operator>(Vec256
   return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
 #endif
 }
-HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
-                                   const Vec256<int16_t> b) {
+HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a,
+                            Vec256<int16_t> b) {
   return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
 }
-HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
-                                   const Vec256<int32_t> b) {
+HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a,
+                            Vec256<int32_t> b) {
   return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
 }
-HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
-                                   const Vec256<int64_t> b) {
+HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a,
+                            Vec256<int64_t> b) {
   return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
 }
 
-template <typename T, HWY_IF_UNSIGNED(T)>
-HWY_API Mask256<T> operator>(const Vec256<T> a, const Vec256<T> b) {
+template <typename T>
+HWY_INLINE Mask256<T> Gt(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
   const Full256<T> du;
   const RebindToSigned<decltype(du)> di;
   const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
   return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
 }
 
-HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
+HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a,
+                          Vec256<float> b) {
   return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
 }
-HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
+HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a,
+                           Vec256<double> b) {
   return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
 }
 
+}  // namespace detail
+
+template <typename T>
+HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
+  return detail::Gt(hwy::TypeTag<T>(), a, b);
+}
+
 // ------------------------------ Weak inequality
 
 HWY_API Mask256<float> operator>=(const Vec256<float> a,
@@ -1857,16 +1880,27 @@ HWY_API Vec256<int8_t> ShiftRightSame(Ve
 
 // ------------------------------ Neg (Xor, Sub)
 
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec256<T> Neg(const Vec256<T> v) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T>
+HWY_INLINE Vec256<T> Neg(hwy::FloatTag /*tag*/, const Vec256<T> v) {
   return Xor(v, SignBit(Full256<T>()));
 }
 
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec256<T> Neg(const Vec256<T> v) {
+// Not floating-point
+template <typename T>
+HWY_INLINE Vec256<T> Neg(hwy::NonFloatTag /*tag*/, const Vec256<T> v) {
   return Zero(Full256<T>()) - v;
 }
 
+}  // namespace detail
+
+template <typename T>
+HWY_API Vec256<T> Neg(const Vec256<T> v) {
+  return detail::Neg(hwy::IsFloatTag<T>(), v);
+}
+
 // ------------------------------ Floating-point mul / div
 
 HWY_API Vec256<float> operator*(const Vec256<float> a, const Vec256<float> b) {
@@ -2065,8 +2099,9 @@ HWY_API Mask256<double> IsFinite(const V
 
 #else
 
-template <typename T, HWY_IF_FLOAT(T)>
+template <typename T>
 HWY_API Mask256<T> IsInf(const Vec256<T> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Full256<T> d;
   const RebindToSigned<decltype(d)> di;
   const VFromD<decltype(di)> vi = BitCast(di, v);
@@ -2075,8 +2110,9 @@ HWY_API Mask256<T> IsInf(const Vec256<T>
 }
 
 // Returns whether normal/subnormal/zero.
-template <typename T, HWY_IF_FLOAT(T)>
+template <typename T>
 HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
+  static_assert(IsFloat<T>(), "Only for float");
   const Full256<T> d;
   const RebindToUnsigned<decltype(d)> du;
   const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
@@ -2206,11 +2242,7 @@ HWY_API Vec256<double> MaskedLoad(Mask25
 // 3-cycle cost of moving data between 128-bit halves and avoids port 5.
 template <typename T>
 HWY_API Vec256<T> LoadDup128(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
-  __m256i out;
-  asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
-  return Vec256<T>{out};
-#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
   // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
   // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
   // upper half undefined) is fine because we're overwriting that anyway.
@@ -2225,11 +2257,7 @@ HWY_API Vec256<T> LoadDup128(Full256<T>
 }
 HWY_API Vec256<float> LoadDup128(Full256<float> /* tag */,
                                  const float* const HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
-  __m256 out;
-  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
-  return Vec256<float>{out};
-#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
   const __m128 v128 = LoadU(Full128<float>(), p).raw;
   return Vec256<float>{
       _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
@@ -2239,11 +2267,7 @@ HWY_API Vec256<float> LoadDup128(Full256
 }
 HWY_API Vec256<double> LoadDup128(Full256<double> /* tag */,
                                   const double* const HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
-  __m256d out;
-  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
-  return Vec256<double>{out};
-#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
   const __m128d v128 = LoadU(Full128<double>(), p).raw;
   return Vec256<double>{
       _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
@@ -2651,33 +2675,43 @@ HWY_API T GetLane(const Vec256<T> v) {
 // compiler could decide to optimize out code that relies on this.
 //
 // The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
-// zeroing, but it is not available on MSVC nor GCC until 10.1. For older GCC,
-// we can still obtain the desired code thanks to pattern recognition; note that
-// the expensive insert instruction is not actually generated, see
-// https://gcc.godbolt.org/z/1MKGaP.
+// zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. For
+// older GCC, we can still obtain the desired code thanks to pattern
+// recognition; note that the expensive insert instruction is not actually
+// generated, see https://gcc.godbolt.org/z/1MKGaP.
+
+#if !defined(HWY_HAVE_ZEXT)
+#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) ||  \
+    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
+    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000)
+#define HWY_HAVE_ZEXT 1
+#else
+#define HWY_HAVE_ZEXT 0
+#endif
+#endif  // defined(HWY_HAVE_ZEXT)
 
 template <typename T>
 HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
-  return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
+#if HWY_HAVE_ZEXT
+  return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
 #else
-  return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
+  return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
 #endif
 }
 HWY_API Vec256<float> ZeroExtendVector(Full256<float> /* tag */,
                                        Vec128<float> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
-  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT
   return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
+#else
+  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
 #endif
 }
 HWY_API Vec256<double> ZeroExtendVector(Full256<double> /* tag */,
                                         Vec128<double> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
-  return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT
   return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
+#else
+  return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
 #endif
 }
 
@@ -3657,12 +3691,14 @@ HWY_API Vec256<TI> TableLookupBytes(cons
 
 // ------------------------------ Shl (Mul, ZipLower)
 
-#if HWY_TARGET > HWY_AVX3  // AVX2 or older
 namespace detail {
 
+#if HWY_TARGET > HWY_AVX3  // AVX2 or older
+
 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+template <typename T>
 HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
+  static_assert(sizeof(T) == 2, "Only for 16-bit");
   const Full256<T> d;
   const RepartitionToWide<decltype(d)> dw;
   const Rebind<float, decltype(dw)> df;
@@ -3680,63 +3716,66 @@ HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(
   return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
 }
 
-}  // namespace detail
 #endif  // HWY_TARGET > HWY_AVX3
 
-HWY_API Vec256<uint16_t> operator<<(const Vec256<uint16_t> v,
-                                    const Vec256<uint16_t> bits) {
+HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
+                                Vec256<uint16_t> bits) {
 #if HWY_TARGET <= HWY_AVX3
   return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
 #else
-  return v * detail::Pow2(bits);
+  return v * Pow2(bits);
 #endif
 }
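+// Illustration: a variable left shift is a multiply by 2^bits in 16-bit
+// arithmetic, e.g. bits = 3 yields a per-lane multiplier of 8, and
+// 0x00FF << 3 == 0x00FF * 8 == 0x07F8 (both modulo 2^16).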
 
-HWY_API Vec256<uint32_t> operator<<(const Vec256<uint32_t> v,
-                                    const Vec256<uint32_t> bits) {
+HWY_INLINE Vec256<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> v,
+                                Vec256<uint32_t> bits) {
   return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
 }
 
-HWY_API Vec256<uint64_t> operator<<(const Vec256<uint64_t> v,
-                                    const Vec256<uint64_t> bits) {
+HWY_INLINE Vec256<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> v,
+                                Vec256<uint64_t> bits) {
   return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
 }
 
-// Signed left shift is the same as unsigned.
-template <typename T, HWY_IF_SIGNED(T)>
-HWY_API Vec256<T> operator<<(const Vec256<T> v, const Vec256<T> bits) {
+template <typename T>
+HWY_INLINE Vec256<T> Shl(hwy::SignedTag /*tag*/, Vec256<T> v, Vec256<T> bits) {
+  // Signed left shifts are the same as unsigned.
   const Full256<T> di;
   const Full256<MakeUnsigned<T>> du;
-  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
+  return BitCast(di,
+                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
+}
+
+}  // namespace detail
+
+template <typename T>
+HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
+  return detail::Shl(hwy::TypeTag<T>(), v, bits);
 }
 
 // ------------------------------ Shr (MulHigh, IfThenElse, Not)
 
-HWY_API Vec256<uint16_t> operator>>(const Vec256<uint16_t> v,
-                                    const Vec256<uint16_t> bits) {
+HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
 #if HWY_TARGET <= HWY_AVX3
   return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
 #else
-  const Full256<uint16_t> d;
+  Full256<uint16_t> d;
   // For bits=0, we cannot mul by 2^16, so fix the result later.
-  const auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
+  auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
   // Replace output with input where bits == 0.
   return IfThenElse(bits == Zero(d), v, out);
 #endif
 }
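+// Illustration: for 1 <= bits <= 15, the 32-bit product v * 2^(16 - bits)
+// equals v << (16 - bits), whose upper 16 bits (MulHigh) are v >> bits; e.g.
+// bits = 4 uses the multiplier 0x1000 and 0xABCD >> 4 == 0x0ABC ==
+// MulHigh(0xABCD, 0x1000). bits = 0 would need the multiplier 2^16, which
+// does not fit in 16 bits, hence the IfThenElse fix-up above.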
 
-HWY_API Vec256<uint32_t> operator>>(const Vec256<uint32_t> v,
-                                    const Vec256<uint32_t> bits) {
+HWY_API Vec256<uint32_t> operator>>(Vec256<uint32_t> v, Vec256<uint32_t> bits) {
   return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
 }
 
-HWY_API Vec256<uint64_t> operator>>(const Vec256<uint64_t> v,
-                                    const Vec256<uint64_t> bits) {
+HWY_API Vec256<uint64_t> operator>>(Vec256<uint64_t> v, Vec256<uint64_t> bits) {
   return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
 }
 
-HWY_API Vec256<int16_t> operator>>(const Vec256<int16_t> v,
-                                   const Vec256<int16_t> bits) {
+HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) {
 #if HWY_TARGET <= HWY_AVX3
   return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
 #else
@@ -3744,13 +3783,11 @@ HWY_API Vec256<int16_t> operator>>(const
 #endif
 }
 
-HWY_API Vec256<int32_t> operator>>(const Vec256<int32_t> v,
-                                   const Vec256<int32_t> bits) {
+HWY_API Vec256<int32_t> operator>>(Vec256<int32_t> v, Vec256<int32_t> bits) {
   return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
 }
 
-HWY_API Vec256<int64_t> operator>>(const Vec256<int64_t> v,
-                                   const Vec256<int64_t> bits) {
+HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
 #if HWY_TARGET <= HWY_AVX3
   return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
 #else
@@ -4040,6 +4077,107 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(con
   return BitCast(Full64<uint8_t>(), pair);
 }
 
+// ------------------------------ Truncations
+
+namespace detail {
+
+// LO and HI each hold four indices of bytes within a 128-bit block.
+template <uint32_t LO, uint32_t HI, typename T>
+HWY_INLINE Vec128<uint32_t> LookupAndConcatHalves(Vec256<T> v) {
+  const Full256<uint32_t> d32;
+
+#if HWY_TARGET <= HWY_AVX3_DL
+  alignas(32) constexpr uint32_t kMap[8] = {
+      LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
+  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
+#else
+  alignas(32) static constexpr uint32_t kMap[8] = {LO,  HI,  ~0u, ~0u,
+                                                   ~0u, ~0u, LO,  HI};
+  const auto quad = TableLookupBytes(v, Load(d32, kMap));
+  const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
+  // Possible alternative:
+  // const auto lo = LowerHalf(quad);
+  // const auto hi = UpperHalf(Full128<uint32_t>(), quad);
+  // const auto result = lo | hi;
+#endif
+
+  return Vec128<uint32_t>{_mm256_castsi256_si128(result)};
+}
+
+// LO and HI each hold two indices of bytes within a 128-bit block.
+template <uint16_t LO, uint16_t HI, typename T>
+HWY_INLINE Vec128<uint32_t, 2> LookupAndConcatQuarters(Vec256<T> v) {
+  const Full256<uint16_t> d16;
+
+#if HWY_TARGET <= HWY_AVX3_DL
+  alignas(32) constexpr uint16_t kMap[16] = {
+      LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d16, kMap).raw);
+  return LowerHalf(Vec128<uint32_t>{_mm256_castsi256_si128(result)});
+#else
+  constexpr uint16_t ff = static_cast<uint16_t>(~0u);
+  alignas(32) static constexpr uint16_t kMap[16] = {
+      LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
+  const auto quad = TableLookupBytes(v, Load(d16, kMap));
+  const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
+  const auto half = _mm256_castsi256_si128(mixed);
+  return LowerHalf(Vec128<uint32_t>{_mm_packus_epi32(half, half)});
+#endif
+}
+
+}  // namespace detail
+
+HWY_API Vec128<uint8_t, 4> TruncateTo(Simd<uint8_t, 4, 0> /* tag */,
+                                      const Vec256<uint64_t> v) {
+  const Full256<uint32_t> d32;
+#if HWY_TARGET <= HWY_AVX3_DL
+  alignas(32) constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, 0, 0, 0, 0};
+  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
+  return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{result})));
+#else
+  alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
+                                                   0x0800FFFFu, ~0u, ~0u, ~0u};
+  const auto quad = TableLookupBytes(v, Load(d32, kMap));
+  const auto lo = LowerHalf(quad);
+  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
+  const auto result = lo | hi;
+  return LowerHalf(LowerHalf(Vec128<uint8_t>{result.raw}));
+#endif
+}
+
+HWY_API Vec128<uint16_t, 4> TruncateTo(Simd<uint16_t, 4, 0> /* tag */,
+                                       const Vec256<uint64_t> v) {
+  const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v);
+  return Vec128<uint16_t, 4>{result.raw};
+}
+
+HWY_API Vec128<uint32_t> TruncateTo(Simd<uint32_t, 4, 0> /* tag */,
+                                    const Vec256<uint64_t> v) {
+  const Full256<uint32_t> d32;
+  alignas(32) constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
+  const auto v32 =
+      TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
+  return LowerHalf(Vec256<uint32_t>{v32.raw});
+}
+
+HWY_API Vec128<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> /* tag */,
+                                      const Vec256<uint32_t> v) {
+  const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v);
+  return Vec128<uint8_t, 8>{full.raw};
+}
+
+HWY_API Vec128<uint16_t> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
+                                    const Vec256<uint32_t> v) {
+  const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v);
+  return Vec128<uint16_t>{full.raw};
+}
+
+HWY_API Vec128<uint8_t> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
+                                   const Vec256<uint16_t> v) {
+  const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v);
+  return Vec128<uint8_t>{full.raw};
+}
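+// Usage sketch (illustration only, not part of this header): TruncateTo keeps
+// the least-significant bits of each lane, e.g. uint32_t 0x89ABCDEFu becomes
+// uint16_t 0xCDEF. A typical call:
+//   const Full256<uint32_t> d32;
+//   const Full128<uint16_t> d16;
+//   const Vec128<uint16_t> lower = TruncateTo(d16, Iota(d32, 0));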
+
 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
 
 HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
@@ -4373,8 +4511,10 @@ template <typename T, HWY_IF_LANE_SIZE(T
 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
   // See CompressIsPartition.
   alignas(16) constexpr uint64_t packed_array[16] = {
-      0x3210, 0x3210, 0x3201, 0x3210, 0x3102, 0x3120, 0x3021, 0x3210,
-      0x2103, 0x2130, 0x2031, 0x2310, 0x1032, 0x1320, 0x0321, 0x3210};
+      // PrintCompress64x4NibbleTables
+      0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
+      0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
+      0x00001032, 0x00001320, 0x00000321, 0x00003210};
 
   // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
   // _mm256_permutexvar_epi64 will ignore the upper bits.
@@ -4386,8 +4526,39 @@ HWY_API Vec256<T> Compress(Vec256<T> v,
   return TableLookupLanes(v, indices);
 }
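+// Worked example (illustration): mask_bits = 0b0100 selects only lane 2;
+// packed_array[4] = 0x00003102, whose nibbles from least significant are
+// {2, 0, 1, 3}, so lane 2 moves to index 0 and the unselected lanes follow -
+// a valid partition as required by CompressIsPartition.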
 
-// ------------------------------ CompressBits (LoadMaskBits)
+// ------------------------------ CompressNot (Compress)
+
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
+  return Compress(v, Not(mask));
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
+  // See CompressIsPartition.
+  alignas(16) constexpr uint64_t packed_array[16] = {
+      // PrintCompressNot64x4NibbleTables
+      0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
+      0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
+      0x00003210, 0x00003201, 0x00003210, 0x00003210};
 
+  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
+  // _mm256_permutexvar_epi64 will ignore the upper bits.
+  const Full256<T> d;
+  const RebindToUnsigned<decltype(d)> du64;
+  const auto packed = Set(du64, packed_array[mask.raw]);
+  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
+  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
+  return TableLookupLanes(v, indices);
+}
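+// Worked example (illustration): mask_bits = 0b0001 (only lane 0 selected);
+// packed_array[1] = 0x00000321 gives indices {1, 2, 3, 0}, i.e. the lanes
+// with a clear mask bit come first and the selected lane 0 moves to the end.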
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
+                                           Mask256<uint64_t> mask) {
+  return CompressNot(v, mask);
+}
+
+// ------------------------------ CompressBits (LoadMaskBits)
 template <typename T>
 HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
   return Compress(v, LoadMaskBits(Full256<T>(), bits));
@@ -4478,8 +4649,6 @@ HWY_API size_t CompressStore(Vec256<doub
 
 // ------------------------------ CompressBlendedStore (CompressStore)
 
-#if HWY_TARGET <= HWY_AVX3
-
 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                     T* HWY_RESTRICT unaligned) {
@@ -4504,35 +4673,6 @@ HWY_API size_t CompressBlendedStore(Vec2
 #endif
 }
 
-#else  // AVX2
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  const size_t count = CountTrue(m);
-  BlendedStore(FirstN(d, count), d, Compress(v, m));
-  return count;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  const size_t count = CountTrue(d, m);
-  const Vec256<T> compressed = Compress(v, m);
-#if HWY_MEM_OPS_MIGHT_FAULT
-  // BlendedStore tests mask for each lane, but we know that the mask is
-  // FirstN, so we can just copy.
-  alignas(32) T buf[16];
-  Store(compressed, d, buf);
-  memcpy(unaligned, buf, count * sizeof(T));
-#else
-  BlendedStore(compressed, FirstN(d, count), d, unaligned);
-#endif
-  return count;
-}
-
-#endif  // AVX2
-
 // ------------------------------ CompressBitsStore (LoadMaskBits)
 
 template <typename T>
@@ -4736,6 +4876,7 @@ HWY_INLINE Indices256<uint32_t> IndicesF
   // and unavailable in 32-bit builds. We instead compress each index into 4
   // bits, for a total of 1 KiB.
   alignas(16) constexpr uint32_t packed_array[256] = {
+      // PrintCompress32x8Tables
       0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
       0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
       0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
@@ -4797,18 +4938,99 @@ HWY_INLINE Indices256<uint32_t> IndicesF
   // For 64-bit, we still need 32-bit indices because there is no 64-bit
   // permutevar, but there are only 4 lanes, so we can afford to skip the
   // unpacking and load the entire index vector directly.
-  alignas(32) constexpr uint32_t packed_array[128] = {
-      0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
-      2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
-      4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7,  //
-      2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7,  //
-      6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5,  //
-      2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5,  //
-      4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3,  //
-      2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 1, 2, 3, 4, 5, 6, 7};
-  return Indices256<uint32_t>{Load(d32, packed_array + 8 * mask_bits).raw};
+  alignas(32) constexpr uint32_t u32_indices[128] = {
+      // PrintCompress64x4PairTables
+      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
+      6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
+      2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
+      0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
+      0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
+      2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
+  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
+                                                   uint64_t mask_bits) {
+  const RebindToUnsigned<decltype(d)> d32;
+  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
+  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
+  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
+  // and unavailable in 32-bit builds. We instead compress each index into 4
+  // bits, for a total of 1 KiB.
+  alignas(16) constexpr uint32_t packed_array[256] = {
+      // PrintCompressNot32x8Tables
+      0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
+      0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
+      0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
+      0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
+      0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
+      0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
+      0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
+      0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
+      0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
+      0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
+      0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
+      0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
+      0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
+      0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
+      0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
+      0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
+      0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
+      0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
+      0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
+      0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
+      0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
+      0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
+      0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
+      0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
+      0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
+      0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
+      0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
+      0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
+      0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
+      0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
+      0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
+      0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
+      0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
+      0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
+      0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
+      0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
+      0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
+      0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
+      0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
+      0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
+      0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
+      0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
+      0x76543210, 0x76543201, 0x76543210, 0x76543210};
+
+  // No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31.
+  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
+  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
+  // latency, it may be faster to use LoadDup128 and PSHUFB.
+  const auto packed = Set(d32, packed_array[mask_bits]);
+  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
+  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
 }
 
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
+                                                   uint64_t mask_bits) {
+  const Repartition<uint32_t, decltype(d)> d32;
+
+  // For 64-bit, we still need 32-bit indices because there is no 64-bit
+  // permutevar, but there are only 4 lanes, so we can afford to skip the
+  // unpacking and load the entire index vector directly.
+  alignas(32) constexpr uint32_t u32_indices[128] = {
+      // PrintCompressNot64x4PairTables
+      0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
+      2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
+      0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
+      2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
+      4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
+      6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
+}
 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
 HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
   const Full256<T> d;
@@ -4859,12 +5081,39 @@ HWY_INLINE Vec256<T> Compress(Vec256<T>
   }
 }
 
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
+HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
+  const Full256<T> d;
+  const Repartition<uint32_t, decltype(d)> du32;
+
+  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
+  const auto indices = IndicesFromNotBits(d, mask_bits);
+  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
+}
+
+// LUTs are infeasible for 2^16 possible masks, so splice together two
+// half-vector Compress.
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
+  // Compress ensures only the lower 16 bits are set, so flip those.
+  return Compress(v, mask_bits ^ 0xFFFF);
+}
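+// Example (illustration): a Vec256 of 16-bit lanes has exactly 16 lanes, so
+// BitsFromMask sets only bits [0, 16) and XOR with 0xFFFF is equivalent to
+// Not(mask), e.g. mask_bits = 0x00F0 becomes 0xFF0F.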
+
 }  // namespace detail
 
 template <typename T>
 HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  return detail::Compress(v, mask_bits);
+  return detail::Compress(v, detail::BitsFromMask(m));
+}
+
+template <typename T>
+HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
+  return detail::CompressNot(v, detail::BitsFromMask(m));
+}
+
+HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
+                                           Mask256<uint64_t> mask) {
+  return CompressNot(v, mask);
 }
 
 template <typename T>
@@ -4897,7 +5146,7 @@ HWY_API size_t CompressStore(Vec256<T> v
   return count;
 }
 
-template <typename T>
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
 HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                     T* HWY_RESTRICT unaligned) {
   const uint64_t mask_bits = detail::BitsFromMask(m);
@@ -4910,6 +5159,25 @@ HWY_API size_t CompressBlendedStore(Vec2
   return count;
 }
 
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
+                                    T* HWY_RESTRICT unaligned) {
+  const uint64_t mask_bits = detail::BitsFromMask(m);
+  const size_t count = PopCount(mask_bits);
+  const Vec256<T> compressed = detail::Compress(v, mask_bits);
+
+#if HWY_MEM_OPS_MIGHT_FAULT  // true if HWY_IS_MSAN
+  // BlendedStore tests mask for each lane, but we know that the mask is
+  // FirstN, so we can just copy.
+  alignas(32) T buf[16];
+  Store(compressed, d, buf);
+  memcpy(unaligned, buf, count * sizeof(T));
+#else
+  BlendedStore(compressed, FirstN(d, count), d, unaligned);
+#endif
+  return count;
+}
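+// Note (illustration): because Compress packs the `count` selected lanes to
+// the front, copying count * sizeof(T) bytes from the stack buffer writes the
+// same lanes a BlendedStore with FirstN(d, count) would, without the per-lane
+// masked store that HWY_MEM_OPS_MIGHT_FAULT flags as problematic (e.g. MSAN).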
+
 template <typename T>
 HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
                                  Full256<T> d, T* HWY_RESTRICT unaligned) {
@@ -5156,3 +5424,7 @@ HWY_API Vec256<T> MaxOfLanes(Full256<T>
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
 HWY_AFTER_NAMESPACE();
+
+// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
+// the warning seems to be issued at the call site of intrinsics, i.e. our code.
+HWY_DIAGNOSTICS(pop)
diff -pruN 0.17.0-11/hwy/ops/x86_512-inl.h 1.0.0-2/hwy/ops/x86_512-inl.h
--- 0.17.0-11/hwy/ops/x86_512-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/ops/x86_512-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -25,7 +25,7 @@
 // Avoid uninitialized warnings in GCC's avx512fintrin.h - see
 // https://github.com/google/highway/issues/710)
 HWY_DIAGNOSTICS(push)
-#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#if HWY_COMPILER_GCC_ACTUAL
 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
 HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
 #endif
@@ -467,7 +467,7 @@ namespace detail {
 template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
 HWY_INLINE Mask512<T> FirstN(size_t n) {
   Mask512<T> m;
-  const uint32_t all = ~uint32_t(0);
+  const uint32_t all = ~uint32_t{0};
   // BZHI only looks at the lower 8 bits of n!
   m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));
   return m;
@@ -475,7 +475,7 @@ HWY_INLINE Mask512<T> FirstN(size_t n) {
 
 template <typename T, HWY_IF_LANE_SIZE(T, 1)>
 HWY_INLINE Mask512<T> FirstN(size_t n) {
-  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
+  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0};
   return Mask512<T>{static_cast<__mmask64>(bits)};
 }
 
@@ -486,7 +486,7 @@ template <typename T>
 HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
 #if HWY_ARCH_X86_64
   Mask512<T> m;
-  const uint64_t all = ~uint64_t(0);
+  const uint64_t all = ~uint64_t{0};
   // BZHI only looks at the lower 8 bits of n!
   m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
   return m;
@@ -1894,40 +1894,19 @@ HWY_API Vec512<double> MaskedLoad(Mask51
 template <typename T>
 HWY_API Vec512<T> LoadDup128(Full512<T> /* tag */,
                              const T* const HWY_RESTRICT p) {
-  // Clang 3.9 generates VINSERTF128 which is slower, but inline assembly leads
-  // to "invalid output size for constraint" without -mavx512:
-  // https://gcc.godbolt.org/z/-Jt_-F
-#if HWY_LOADDUP_ASM
-  __m512i out;
-  asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
-  return Vec512<T>{out};
-#else
   const auto x4 = LoadU(Full128<T>(), p);
   return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
-#endif
 }
 HWY_API Vec512<float> LoadDup128(Full512<float> /* tag */,
                                  const float* const HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
-  __m512 out;
-  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
-  return Vec512<float>{out};
-#else
   const __m128 x4 = _mm_loadu_ps(p);
   return Vec512<float>{_mm512_broadcast_f32x4(x4)};
-#endif
 }
 
 HWY_API Vec512<double> LoadDup128(Full512<double> /* tag */,
                                   const double* const HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
-  __m512d out;
-  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
-  return Vec512<double>{out};
-#else
   const __m128d x2 = _mm_loadu_pd(p);
   return Vec512<double>{_mm512_broadcast_f64x2(x2)};
-#endif
 }
 
 // ------------------------------ Store
@@ -2218,39 +2197,28 @@ HWY_API T GetLane(const Vec512<T> v) {
 
 // ------------------------------ ZeroExtendVector
 
-// Unfortunately the initial _mm512_castsi256_si512 intrinsic leaves the upper
-// bits undefined. Although it makes sense for them to be zero (EVEX encoded
-// instructions have that effect), a compiler could decide to optimize out code
-// that relies on this.
-//
-// The newer _mm512_zextsi256_si512 intrinsic fixes this by specifying the
-// zeroing, but it is not available on GCC until 10.1. For older GCC, we can
-// still obtain the desired code thanks to pattern recognition; note that the
-// expensive insert instruction is not actually generated, see
-// https://gcc.godbolt.org/z/1MKGaP.
-
 template <typename T>
 HWY_API Vec512<T> ZeroExtendVector(Full512<T> /* tag */, Vec256<T> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
-  return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT  // See definition/comment in x86_256-inl.h.
   return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
+#else
+  return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
 #endif
 }
 HWY_API Vec512<float> ZeroExtendVector(Full512<float> /* tag */,
                                        Vec256<float> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
-  return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT
   return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
+#else
+  return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
 #endif
 }
 HWY_API Vec512<double> ZeroExtendVector(Full512<double> /* tag */,
                                         Vec256<double> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
-  return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT
   return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
+#else
+  return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
 #endif
 }
 
@@ -3319,6 +3287,106 @@ HWY_API Vec128<uint8_t, 16> U8FromU32(co
   return LowerHalf(LowerHalf(bytes));
 }
 
+// ------------------------------ Truncations
+
+HWY_API Vec128<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> d,
+                                      const Vec512<uint64_t> v) {
+#if HWY_TARGET == HWY_AVX3_DL
+  (void)d;
+  const Full512<uint8_t> d8;
+  alignas(16) static constexpr uint8_t k8From64[16] = {
+    0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
+  const Vec512<uint8_t> bytes{
+      _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
+  return LowerHalf(LowerHalf(LowerHalf(bytes)));
+#else
+  const Full512<uint32_t> d32;
+  alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
+                                              0, 2, 4, 6, 8, 10, 12, 14};
+  const Vec512<uint32_t> even{
+      _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
+  return TruncateTo(d, LowerHalf(even));
+#endif
+}
+
+HWY_API Vec128<uint16_t, 8> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
+                                       const Vec512<uint64_t> v) {
+  const Full512<uint16_t> d16;
+  alignas(16) static constexpr uint16_t k16From64[8] = {
+      0, 4, 8, 12, 16, 20, 24, 28};
+  const Vec512<uint16_t> bytes{
+      _mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)};
+  return LowerHalf(LowerHalf(bytes));
+}
+
+HWY_API Vec256<uint32_t> TruncateTo(Simd<uint32_t, 8, 0> /* tag */,
+                                    const Vec512<uint64_t> v) {
+  const Full512<uint32_t> d32;
+  alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
+                                              0, 2, 4, 6, 8, 10, 12, 14};
+  const Vec512<uint32_t> even{
+      _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
+  return LowerHalf(even);
+}
+
+HWY_API Vec128<uint8_t, 16> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
+                                       const Vec512<uint32_t> v) {
+#if HWY_TARGET == HWY_AVX3_DL
+  const Full512<uint8_t> d8;
+  alignas(16) static constexpr uint8_t k8From32[16] = {
+    0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
+  const Vec512<uint8_t> bytes{
+      _mm512_permutexvar_epi32(LoadDup128(d8, k8From32).raw, v.raw)};
+#else
+  const Full512<uint32_t> d32;
+  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
+  // lowest 4 bytes.
+  alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
+                                                       ~0u};
+  const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
+  // Gather the lowest 4 bytes of 4 128-bit blocks.
+  alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
+  const Vec512<uint8_t> bytes{
+      _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
+#endif
+  return LowerHalf(LowerHalf(bytes));
+}
+
+HWY_API Vec256<uint16_t> TruncateTo(Simd<uint16_t, 16, 0> /* tag */,
+                                    const Vec512<uint32_t> v) {
+  const Full512<uint16_t> d16;
+  alignas(64) static constexpr uint16_t k16From32[32] = {
+      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+  const Vec512<uint16_t> bytes{
+      _mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)};
+  return LowerHalf(bytes);
+}
+
+HWY_API Vec256<uint8_t> TruncateTo(Simd<uint8_t, 32, 0> /* tag */,
+                                   const Vec512<uint16_t> v) {
+#if HWY_TARGET == HWY_AVX3_DL
+  const Full512<uint8_t> d8;
+  alignas(64) static constexpr uint8_t k8From16[64] = {
+     0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+     0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
+  const Vec512<uint8_t> bytes{
+      _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
+#else
+  const Full512<uint32_t> d32;
+  alignas(16) static constexpr uint32_t k16From32[4] = {
+      0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
+  const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32));
+  alignas(64) static constexpr uint32_t kIndex32[16] = {
+      0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
+  const Vec512<uint8_t> bytes{
+      _mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)};
+#endif
+  return LowerHalf(bytes);
+}
+
 // ------------------------------ Convert integer <=> floating point
 
 HWY_API Vec512<float> ConvertTo(Full512<float> /* tag */,
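
The TruncateTo overloads above share one lane-wise contract: each wider unsigned lane is narrowed by keeping its least-significant bytes, and the permute tables merely gather those bytes across the 512-bit vector. A minimal scalar sketch of that contract for the u64 -> u8 case, assuming the same little-endian byte selection as the tables (helper name is illustrative, not library code):

#include <cstddef>
#include <cstdint>

// Reference semantics for TruncateTo(Simd<uint8_t, 8, 0>, Vec512<uint64_t>):
// keep the low byte of each 64-bit lane, preserving lane order.
void TruncateU64ToU8(const uint64_t* in, uint8_t* out, size_t num_lanes) {
  for (size_t i = 0; i < num_lanes; ++i) {
    out[i] = static_cast<uint8_t>(in[i] & 0xFF);
  }
}
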
@@ -3570,6 +3638,7 @@ template <typename T, HWY_IF_LANE_SIZE(T
 HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
   // See CompressIsPartition. u64 is faster than u32.
   alignas(16) constexpr uint64_t packed_array[256] = {
+      // PrintCompress32x8Tables
       0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
       0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
       0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
@@ -3686,6 +3755,77 @@ HWY_API Vec512<T> Compress(Vec512<T> v,
   return BitCast(d, cu);
 }
 
+// ------------------------------ CompressNot
+
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec512<T> CompressNot(Vec512<T> v, const Mask512<T> mask) {
+  return Compress(v, Not(mask));
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
+  // See CompressIsPartition. u64 is faster than u32.
+  alignas(16) constexpr uint64_t packed_array[256] = {
+      // PrintCompressNot32x8Tables
+      0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
+      0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
+      0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
+      0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
+      0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
+      0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
+      0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
+      0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
+      0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
+      0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
+      0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
+      0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
+      0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
+      0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
+      0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
+      0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
+      0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
+      0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
+      0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
+      0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
+      0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
+      0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
+      0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
+      0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
+      0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
+      0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
+      0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
+      0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
+      0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
+      0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
+      0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
+      0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
+      0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
+      0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
+      0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
+      0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
+      0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
+      0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
+      0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
+      0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
+      0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
+      0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
+      0x76543210, 0x76543201, 0x76543210, 0x76543210};
+
+  // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
+  // _mm512_permutexvar_epi64 will ignore the upper bits.
+  const Full512<T> d;
+  const RebindToUnsigned<decltype(d)> du64;
+  const auto packed = Set(du64, packed_array[mask.raw]);
+  alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
+  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
+  return TableLookupLanes(v, indices);
+}
+
+HWY_API Vec512<uint64_t> CompressBlocksNot(Vec512<uint64_t> v,
+                                           Mask512<uint64_t> mask) {
+  return CompressNot(v, mask);
+}
+
 // ------------------------------ CompressBits
 template <typename T>
 HWY_API Vec512<T> CompressBits(Vec512<T> v, const uint8_t* HWY_RESTRICT bits) {
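
Each 64-bit entry in the CompressNot table above packs eight 4-bit source-lane indices, one nibble per output lane; shifting by 4*i brings output lane i's index into the low bits, and _mm512_permutexvar_epi64 ignores the remainder. A standalone decode of one entry (illustrative only; 0x07654321 is the second entry, i.e. the mask with only lane 0 set, which moves lane 0 to the end):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t packed = 0x07654321u;  // CompressNot entry for mask 0b00000001
  for (int i = 0; i < 8; ++i) {
    const unsigned src = static_cast<unsigned>((packed >> (4 * i)) & 0xF);
    printf("output lane %d <- input lane %u\n", i, src);  // 1, 2, ..., 7, 0
  }
  return 0;
}
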
diff -pruN 0.17.0-11/hwy/per_target.cc 1.0.0-2/hwy/per_target.cc
--- 0.17.0-11/hwy/per_target.cc	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/per_target.cc	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,50 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/per_target.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/per_target.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+// On SVE, Lanes rounds down to a power of two, but we want to know the actual
+// size here. Otherwise, hypothetical SVE with 48 bytes would round down to 32
+// and we'd enable HWY_SVE_256, and then fail reverse_test because Reverse on
+// HWY_SVE_256 requires the actual vector to be a power of two.
+#if HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE_256
+size_t GetVectorBytes() { return detail::AllHardwareLanes(hwy::SizeTag<1>()); }
+#else
+size_t GetVectorBytes() { return Lanes(ScalableTag<uint8_t>()); }
+#endif
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(GetVectorBytes);  // Local function.
+}  // namespace
+
+size_t VectorBytes() { return HWY_DYNAMIC_DISPATCH(GetVectorBytes)(); }
+
+}  // namespace hwy
+#endif  // HWY_ONCE
diff -pruN 0.17.0-11/hwy/per_target.h 1.0.0-2/hwy/per_target.h
--- 0.17.0-11/hwy/per_target.h	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/per_target.h	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,37 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_PER_TARGET_H_
+#define HIGHWAY_HWY_PER_TARGET_H_
+
+#include <stddef.h>
+
+// Per-target functions.
+
+namespace hwy {
+
+// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
+//
+// Do not cache the result, which may change after calling DisableTargets, or
+// if software requests a different vector size (e.g. when entering/exiting SME
+// streaming mode). Instead call this right before the code that depends on the
+// result, without any DisableTargets or SME transition in-between. Note that
+// this involves an indirect call, so prefer not to call this frequently nor
+// unnecessarily.
+size_t VectorBytes();
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_PER_TARGET_H_
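
A brief usage sketch for the new per_target.h interface, following the comment's advice to query right before use instead of caching (standalone example, not from the library sources):

#include <cstdio>

#include "hwy/per_target.h"

int main() {
  // Query immediately before use; the value may change after DisableTargets.
  const size_t bytes = hwy::VectorBytes();
  printf("current vector width: %zu bytes\n", bytes);
  return 0;
}
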
diff -pruN 0.17.0-11/hwy/print.cc 1.0.0-2/hwy/print.cc
--- 0.17.0-11/hwy/print.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/print.cc	2022-07-27 11:48:16.000000000 +0000
@@ -28,12 +28,13 @@ HWY_DLLEXPORT void TypeName(const TypeIn
   const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u');
   // Omit the xN suffix for scalars.
   if (N == 1) {
-    snprintf(string100, 64, "%c%" PRIu64, prefix,
-             static_cast<uint64_t>(info.sizeof_t * 8));
+    // NOLINTNEXTLINE
+    snprintf(string100, 64, "%c%d", prefix,
+             static_cast<int>(info.sizeof_t * 8));
   } else {
-    snprintf(string100, 64, "%c%" PRIu64 "x%" PRIu64, prefix,
-             static_cast<uint64_t>(info.sizeof_t * 8),
-             static_cast<uint64_t>(N));
+    // NOLINTNEXTLINE
+    snprintf(string100, 64, "%c%dx%d", prefix,
+             static_cast<int>(info.sizeof_t * 8), static_cast<int>(N));
   }
 }
 
@@ -42,39 +43,39 @@ HWY_DLLEXPORT void ToString(const TypeIn
   if (info.sizeof_t == 1) {
     uint8_t byte;
     CopyBytes<1>(ptr, &byte);  // endian-safe: we ensured sizeof(T)=1.
-    snprintf(string100, 100, "0x%02X", byte);
+    snprintf(string100, 100, "0x%02X", byte);  // NOLINT
   } else if (info.sizeof_t == 2) {
     uint16_t bits;
     CopyBytes<2>(ptr, &bits);
-    snprintf(string100, 100, "0x%04X", bits);
+    snprintf(string100, 100, "0x%04X", bits);  // NOLINT
   } else if (info.sizeof_t == 4) {
     if (info.is_float) {
       float value;
       CopyBytes<4>(ptr, &value);
-      snprintf(string100, 100, "%g", double(value));
+      snprintf(string100, 100, "%g", static_cast<double>(value));  // NOLINT
     } else if (info.is_signed) {
       int32_t value;
       CopyBytes<4>(ptr, &value);
-      snprintf(string100, 100, "%d", value);
+      snprintf(string100, 100, "%d", value);  // NOLINT
     } else {
       uint32_t value;
       CopyBytes<4>(ptr, &value);
-      snprintf(string100, 100, "%u", value);
+      snprintf(string100, 100, "%u", value);  // NOLINT
     }
   } else {
     HWY_ASSERT(info.sizeof_t == 8);
     if (info.is_float) {
       double value;
       CopyBytes<8>(ptr, &value);
-      snprintf(string100, 100, "%g", value);
+      snprintf(string100, 100, "%g", value);  // NOLINT
     } else if (info.is_signed) {
       int64_t value;
       CopyBytes<8>(ptr, &value);
-      snprintf(string100, 100, "%" PRIi64 "", value);
+      snprintf(string100, 100, "%" PRIi64 "", value);  // NOLINT
     } else {
       uint64_t value;
       CopyBytes<8>(ptr, &value);
-      snprintf(string100, 100, "%" PRIu64 "", value);
+      snprintf(string100, 100, "%" PRIu64 "", value);  // NOLINT
     }
   }
 }
diff -pruN 0.17.0-11/hwy/print.h 1.0.0-2/hwy/print.h
--- 0.17.0-11/hwy/print.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/print.h	2022-07-27 11:48:16.000000000 +0000
@@ -19,6 +19,7 @@
 // Helpers for printing vector lanes.
 
 #include <stddef.h>
+#include <stdio.h>
 
 #include "hwy/base.h"
 #include "hwy/highway_export.h"
@@ -53,6 +54,20 @@ HWY_DLLEXPORT void PrintArray(const Type
                               size_t lane_u = 0, size_t max_lanes = 7);
 
 }  // namespace detail
+
+template <typename T>
+HWY_NOINLINE void PrintValue(T value) {
+  char str[100];
+  detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
+  fprintf(stderr, "%s,", str);
+}
+
+template <typename T>
+HWY_NOINLINE void PrintArray(const T* value, size_t count) {
+  detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
+                     count);
+}
+
 }  // namespace hwy
 
 #endif  // HWY_PRINT_H_
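
The PrintValue/PrintArray helpers added above format via detail::ToString and write to stderr; a small sketch of calling them (values are arbitrary, standalone example):

#include "hwy/print.h"

int main() {
  const float lanes[4] = {1.0f, 2.5f, -3.0f, 0.125f};
  hwy::PrintValue(lanes[0]);  // writes the formatted value plus a trailing comma
  hwy::PrintArray(lanes, 4);  // writes all four lanes, annotated with the lane type
  return 0;
}
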
diff -pruN 0.17.0-11/hwy/print-inl.h 1.0.0-2/hwy/print-inl.h
--- 0.17.0-11/hwy/print-inl.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/print-inl.h	2022-07-27 11:48:16.000000000 +0000
@@ -34,32 +34,12 @@ HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
 
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_NOINLINE void PrintValue(T value) {
-  uint8_t byte;
-  CopyBytes<1>(&value, &byte);  // endian-safe: we ensured sizeof(T)=1.
-  fprintf(stderr, "0x%02X,", byte);
-}
-
-#if HWY_HAVE_FLOAT16
-HWY_NOINLINE void PrintValue(float16_t value) {
-  uint16_t bits;
-  CopyBytes<2>(&value, &bits);
-  fprintf(stderr, "0x%02X,", bits);
-}
-#endif
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
-HWY_NOINLINE void PrintValue(T value) {
-  fprintf(stderr, "%g,", double(value));
-}
-
 // Prints lanes around `lane`, in memory order.
 template <class D, class V = Vec<D>>
 void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
            size_t max_lanes = 7) {
-  using T = TFromD<D>;
   const size_t N = Lanes(d);
+  using T = TFromD<D>;
   auto lanes = AllocateAligned<T>(N);
   Store(v, d, lanes.get());
 
diff -pruN 0.17.0-11/hwy/targets.cc 1.0.0-2/hwy/targets.cc
--- 0.17.0-11/hwy/targets.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/targets.cc	2022-07-27 11:48:16.000000000 +0000
@@ -15,6 +15,7 @@
 
 #include "hwy/targets.h"
 
+#include <inttypes.h>  // PRIx64
 #include <stdarg.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -22,7 +23,7 @@
 
 #include <atomic>
 
-#include "hwy/base.h"
+#include "hwy/per_target.h"
 
 #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
 #include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
@@ -37,7 +38,11 @@
 #else  // !HWY_COMPILER_MSVC
 #include <cpuid.h>
 #endif  // HWY_COMPILER_MSVC
-#endif  // HWY_ARCH_X86
+
+#elif HWY_ARCH_ARM && HWY_OS_LINUX
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#endif  // HWY_ARCH_*
 
 namespace hwy {
 namespace {
@@ -58,7 +63,7 @@ HWY_INLINE void Cpuid(const uint32_t lev
   for (int i = 0; i < 4; ++i) {
     abcd[i] = regs[i];
   }
-#else  // HWY_COMPILER_MSVC
+#else   // HWY_COMPILER_MSVC
   uint32_t a;
   uint32_t b;
   uint32_t c;
@@ -76,7 +81,7 @@ HWY_INLINE void Cpuid(const uint32_t lev
 uint32_t ReadXCR0() {
 #if HWY_COMPILER_MSVC
   return static_cast<uint32_t>(_xgetbv(0));
-#else  // HWY_COMPILER_MSVC
+#else   // HWY_COMPILER_MSVC
   uint32_t xcr0, xcr0_high;
   const uint32_t index = 0;
   asm volatile(".byte 0x0F, 0x01, 0xD0"
@@ -88,15 +93,12 @@ uint32_t ReadXCR0() {
 
 #endif  // HWY_ARCH_X86
 
-// Not function-local => no compiler-generated locking.
-std::atomic<uint32_t> supported_{0};  // Not yet initialized
-
 // When running tests, this value can be set to the mocked supported targets
 // mask. Only written to from a single thread before the test starts.
-uint32_t supported_targets_for_test_ = 0;
+int64_t supported_targets_for_test_ = 0;
 
 // Mask of targets disabled at runtime with DisableTargets.
-uint32_t supported_mask_{LimitsMax<uint32_t>()};
+int64_t supported_mask_ = LimitsMax<int64_t>();
 
 #if HWY_ARCH_X86
 // Arbitrary bit indices indicating which instruction set extensions are
@@ -184,77 +186,13 @@ constexpr uint64_t kGroupAVX3_DL =
 
 #endif  // HWY_ARCH_X86
 
-}  // namespace
-
-HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
-    Abort(const char* file, int line, const char* format, ...) {
-  char buf[2000];
-  va_list args;
-  va_start(args, format);
-  vsnprintf(buf, sizeof(buf), format, args);
-  va_end(args);
-
-  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
-
-// If compiled with any sanitizer, they can also print a stack trace.
-#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
-  __sanitizer_print_stack_trace();
-#endif  // HWY_IS_*
-  fflush(stderr);
-
-// Now terminate the program:
-#if HWY_ARCH_RVV
-  exit(1);  // trap/abort just freeze Spike.
-#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
-  // Facilitates breaking into a debugger, but don't use this in non-debug
-  // builds because it looks like "illegal instruction", which is misleading.
-  __builtin_trap();
-#else
-  abort();  // Compile error without this due to HWY_NORETURN.
-#endif
-}
-
-HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets) {
-  supported_mask_ =
-      ~(disabled_targets & ~static_cast<uint32_t>(HWY_ENABLED_BASELINE));
-  // We can call Update() here to initialize the mask but that will trigger a
-  // call to SupportedTargets() which we use in tests to tell whether any of the
-  // highway dynamic dispatch functions were used.
-  GetChosenTarget().DeInit();
-}
-
-HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets) {
-  // Reset the cached supported_ value to 0 to force a re-evaluation in the
-  // next call to SupportedTargets() which will use the mocked value set here
-  // if not zero.
-  supported_.store(0, std::memory_order_release);
-  supported_targets_for_test_ = targets;
-  GetChosenTarget().DeInit();
-}
-
-HWY_DLLEXPORT bool SupportedTargetsCalledForTest() {
-  return supported_.load(std::memory_order_acquire) != 0;
-}
-
-HWY_DLLEXPORT uint32_t SupportedTargets() {
-  uint32_t bits = supported_.load(std::memory_order_acquire);
-  // Already initialized?
-  if (HWY_LIKELY(bits != 0)) {
-    return bits & supported_mask_;
-  }
-
-  // When running tests, this allows to mock the current supported targets.
-  if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
-    // Store the value to signal that this was used.
-    supported_.store(supported_targets_for_test_, std::memory_order_release);
-    return supported_targets_for_test_ & supported_mask_;
-  }
-
-#if defined(HWY_COMPILE_ONLY_SCALAR)
-  bits = HWY_SCALAR;
-#else
-  bits = HWY_EMU128;
-#endif
+// Returns targets supported by the CPU, independently of DisableTargets.
+// Factored out of SupportedTargets to make its structure more obvious. Note
+// that x86 CPUID may take several hundred cycles.
+int64_t DetectTargets() {
+  // Apps will use only one of these (the default is EMU128), but compile flags
+  // for this TU may differ from that of the app, so allow both.
+  int64_t bits = HWY_SCALAR | HWY_EMU128;
 
 #if HWY_ARCH_X86
   bool has_osxsave = false;
@@ -344,10 +282,58 @@ HWY_DLLEXPORT uint32_t SupportedTargets(
   }
 
   if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
-    fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
-            size_t(bits), HWY_ENABLED_BASELINE);
+    fprintf(stderr,
+            "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
+            "\n",
+            bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
   }
-#else
+
+#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
+  using CapBits = unsigned long;  // NOLINT
+  const CapBits hw = getauxval(AT_HWCAP);
+  (void)hw;
+
+#if HWY_ARCH_ARM_A64
+
+#if defined(HWCAP_AES)
+  // aarch64 always has NEON and VFPv4, but not necessarily AES, which we
+  // require and thus must still check for.
+  if (hw & HWCAP_AES) {
+    bits |= HWY_NEON;
+  }
+#endif  // HWCAP_AES
+
+#if defined(HWCAP_SVE)
+  if (hw & HWCAP_SVE) {
+    bits |= HWY_SVE;
+  }
+#endif
+
+#if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES)
+  const CapBits hw2 = getauxval(AT_HWCAP2);
+  if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
+    bits |= HWY_SVE2;
+  }
+#endif
+
+#else  // HWY_ARCH_ARM_A64
+
+// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
+// Note that AES has a different HWCAP bit compared to aarch64.
+#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
+  if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
+    bits |= HWY_NEON;
+  }
+#endif
+
+#endif  // HWY_ARCH_ARM_A64
+  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
+    fprintf(stderr,
+            "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
+            "\n",
+            bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
+  }
+#else   // HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
   // TODO(janwas): detect for other platforms and check for baseline
   // This file is typically compiled without HWY_IS_TEST, but targets_test has
   // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
@@ -355,8 +341,86 @@ HWY_DLLEXPORT uint32_t SupportedTargets(
   bits |= HWY_ENABLED_BASELINE;
 #endif  // HWY_ARCH_X86
 
-  supported_.store(bits, std::memory_order_release);
-  return bits & supported_mask_;
+  return bits;
+}
+
+}  // namespace
+
+HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
+    Abort(const char* file, int line, const char* format, ...) {
+  char buf[2000];
+  va_list args;
+  va_start(args, format);
+  vsnprintf(buf, sizeof(buf), format, args);
+  va_end(args);
+
+  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
+
+// If compiled with any sanitizer, they can also print a stack trace.
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+  __sanitizer_print_stack_trace();
+#endif  // HWY_IS_*
+  fflush(stderr);
+
+// Now terminate the program:
+#if HWY_ARCH_RVV
+  exit(1);  // trap/abort just freeze Spike.
+#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
+  // Facilitates breaking into a debugger, but don't use this in non-debug
+  // builds because it looks like "illegal instruction", which is misleading.
+  __builtin_trap();
+#else
+  abort();  // Compile error without this due to HWY_NORETURN.
+#endif
+}
+
+HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
+  supported_mask_ = static_cast<int64_t>(~disabled_targets);
+  // This will take effect on the next call to SupportedTargets, which is
+  // called right before GetChosenTarget::Update. However, calling Update here
+  // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
+  // to check in tests. We instead de-initialize such that the next
+  // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
+  GetChosenTarget().DeInit();
+}
+
+HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
+  supported_targets_for_test_ = targets;
+  GetChosenTarget().DeInit();  // see comment above
+}
+
+HWY_DLLEXPORT int64_t SupportedTargets() {
+  int64_t targets = supported_targets_for_test_;
+  if (HWY_LIKELY(targets == 0)) {
+    // Mock not active. Re-detect instead of caching just in case we're on a
+    // heterogeneous ISA (also requires some app support to pin threads). This
+    // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
+    // DisableTargets or SetSupportedTargetsForTest.
+    targets = DetectTargets();
+
+    // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
+    // first set up ChosenTarget. No need to Update() again afterwards with the
+    // final targets - that will be done by a caller of this function.
+    GetChosenTarget().Update(targets);
+
+    // Now that we can call VectorBytes, check for targets with specific sizes.
+    if (HWY_ARCH_ARM_A64) {
+      const size_t vec_bytes = VectorBytes();  // uncached, see declaration
+      if ((targets & HWY_SVE) && vec_bytes == 32) {
+        targets = static_cast<int64_t>(targets | HWY_SVE_256);
+      } else {
+        targets = static_cast<int64_t>(targets & ~HWY_SVE_256);
+      }
+      if ((targets & HWY_SVE2) && vec_bytes == 16) {
+        targets = static_cast<int64_t>(targets | HWY_SVE2_128);
+      } else {
+        targets = static_cast<int64_t>(targets & ~HWY_SVE2_128);
+      }
+    }  // HWY_ARCH_ARM_A64
+  }
+
+  targets &= supported_mask_;
+  return targets == 0 ? HWY_STATIC_TARGET : targets;
 }
 
 HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
@@ -364,14 +428,4 @@ HWY_DLLEXPORT ChosenTarget& GetChosenTar
   return chosen_target;
 }
 
-HWY_DLLEXPORT void ChosenTarget::Update() {
-  // The supported variable contains the current CPU supported targets shifted
-  // to the location expected by the ChosenTarget mask. We enabled SCALAR
-  // regardless of whether it was compiled since it is also used as the
-  // fallback mechanism to the baseline target.
-  uint32_t supported = HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
-                       HWY_CHOSEN_TARGET_MASK_SCALAR;
-  StoreMask(supported);
-}
-
 }  // namespace hwy
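
SupportedTargets now re-detects on each call (unless mocked) and, after applying the DisableTargets mask, falls back to HWY_STATIC_TARGET rather than returning 0. A hedged sketch of that contract from the caller's side (standalone, mirrors DisabledTargetsTest below):

#include <cstdio>

#include "hwy/targets.h"

int main() {
  hwy::DisableTargets(~0LL);  // request that everything be disabled
  // Still non-zero: the static (baseline) target remains callable.
  printf("after DisableTargets(~0): %s\n",
         hwy::TargetName(hwy::SupportedTargets()));
  hwy::DisableTargets(0);  // restore normal behavior
  return 0;
}
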
diff -pruN 0.17.0-11/hwy/targets.h 1.0.0-2/hwy/targets.h
--- 0.17.0-11/hwy/targets.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/targets.h	2022-07-27 11:48:16.000000000 +0000
@@ -31,11 +31,12 @@
 
 namespace hwy {
 
-// Returns (cached) bitfield of enabled targets that are supported on this CPU.
-// Implemented in targets.cc; unconditionally compiled to support the use case
-// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
-// eliding calls to this function.
-HWY_DLLEXPORT uint32_t SupportedTargets();
+// Returns bitfield of enabled targets that are supported on this CPU; there is
+// always at least one such target, hence the return value is never 0. The
+// targets returned may change after calling DisableTargets. This function is
+// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
+// calls to it if there is only a single target enabled.
+HWY_DLLEXPORT int64_t SupportedTargets();
 
 // Evaluates to a function call, or literal if there is a single target.
 #if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
@@ -44,40 +45,36 @@ HWY_DLLEXPORT uint32_t SupportedTargets(
 #define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
 #endif
 
-// Disable from runtime dispatch the mask of compiled in targets. Targets that
-// were not enabled at compile time are ignored. This function is useful to
-// disable a target supported by the CPU that is known to have bugs or when a
-// lower target is desired. For this reason, attempts to disable targets which
-// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
-// returns at least the baseline target.
-HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
-
-// Set the mock mask of CPU supported targets instead of the actual CPU
-// supported targets computed in SupportedTargets(). The return value of
-// SupportedTargets() will still be affected by the DisableTargets() mask
-// regardless of this mock, to prevent accidentally adding targets that are
-// known to be buggy in the current CPU. Call with a mask of 0 to disable the
-// mock and use the actual CPU supported targets instead.
-HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets);
-
-// Returns whether the SupportedTargets() function was called since the last
-// SetSupportedTargetsForTest() call.
-HWY_DLLEXPORT bool SupportedTargetsCalledForTest();
+// Subsequent SupportedTargets will not return targets whose bit(s) are set in
+// `disabled_targets`. Exception: if SupportedTargets would return 0, it will
+// instead return HWY_STATIC_TARGET (there must always be one target to call).
+//
+// This function is useful for disabling targets known to be buggy, or if the
+// best available target is undesirable (perhaps due to throttling or memory
+// bandwidth limitations). Use SetSupportedTargetsForTest instead of this
+// function for iteratively enabling specific targets for testing.
+HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
+
+// Subsequent SupportedTargets will return the given set of targets, except
+// those disabled via DisableTargets. Call with a mask of 0 to disable the mock
+// and return to the normal SupportedTargets behavior. Used to run tests for
+// all targets.
+HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
 
 // Return the list of targets in HWY_TARGETS supported by the CPU as a list of
 // individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
 // is affected by the current SetSupportedTargetsForTest() mock if any.
-HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
-  std::vector<uint32_t> ret;
-  for (uint32_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
+HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
+  std::vector<int64_t> ret;
+  for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
        targets = targets & (targets - 1)) {
-    uint32_t current_target = targets & ~(targets - 1);
+    int64_t current_target = targets & ~(targets - 1);
     ret.push_back(current_target);
   }
   return ret;
 }
 
-static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
+static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
   switch (target) {
 #if HWY_ARCH_X86
     case HWY_SSSE3:
@@ -93,22 +90,28 @@ static inline HWY_MAYBE_UNUSED const cha
 #endif
 
 #if HWY_ARCH_ARM
+    case HWY_SVE2_128:
+      return "SVE2_128";
+    case HWY_SVE_256:
+      return "SVE_256";
     case HWY_SVE2:
       return "SVE2";
     case HWY_SVE:
       return "SVE";
     case HWY_NEON:
-      return "Neon";
+      return "NEON";
 #endif
 
 #if HWY_ARCH_PPC
     case HWY_PPC8:
-      return "Power8";
+      return "PPC8";
 #endif
 
 #if HWY_ARCH_WASM
     case HWY_WASM:
-      return "Wasm";
+      return "WASM";
+    case HWY_WASM_EMU256:
+      return "WASM_EMU256";
 #endif
 
 #if HWY_ARCH_RVV
@@ -117,9 +120,9 @@ static inline HWY_MAYBE_UNUSED const cha
 #endif
 
     case HWY_EMU128:
-      return "Emu128";
+      return "EMU128";
     case HWY_SCALAR:
-      return "Scalar";
+      return "SCALAR";
 
     default:
       return "Unknown";  // must satisfy gtest IsValidParamName()
@@ -132,7 +135,7 @@ static inline HWY_MAYBE_UNUSED const cha
 // For the ChosenTarget mask and index we use a different bit arrangement than
 // in the HWY_TARGETS mask. Only the targets involved in the current
 // architecture are used in this mask, and therefore only the least significant
-// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the uint32_t mask are used. The least
+// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
 // significant bit is set when the mask is not initialized, the next
 // HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
 // HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
@@ -146,81 +149,111 @@ static inline HWY_MAYBE_UNUSED const cha
 // scalar.
 
 // The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
-#define HWY_CHOSEN_TARGET_MASK_SCALAR (1u << (HWY_MAX_DYNAMIC_TARGETS + 1))
+#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
 
 // Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
 // current architecture.
 #define HWY_CHOSEN_TARGET_SHIFT(X)                                    \
   ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
-    ((1u << HWY_MAX_DYNAMIC_TARGETS) - 1))                            \
+    ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1))                           \
    << 1)
 
 // The HWY_TARGETS mask in the ChosenTarget mask format.
 #define HWY_CHOSEN_TARGET_MASK_TARGETS \
-  (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1u)
+  (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
 
 #if HWY_ARCH_X86
 // Maximum number of dynamic targets, changing this value is an ABI incompatible
 // change
-#define HWY_MAX_DYNAMIC_TARGETS 10
+#define HWY_MAX_DYNAMIC_TARGETS 15
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
 // These must match the order in which the HWY_TARGETS are defined
 // starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
 // HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
 // HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
 // corresponds to the best target. Don't include a "," at the end of the list.
-#define HWY_CHOOSE_TARGET_LIST(func_name)           \
-  nullptr,                           /* reserved */ \
-      nullptr,                       /* reserved */ \
-      HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */  \
-      HWY_CHOOSE_AVX3(func_name),    /* AVX3 */     \
-      HWY_CHOOSE_AVX2(func_name),    /* AVX2 */     \
-      nullptr,                       /* AVX */      \
-      HWY_CHOOSE_SSE4(func_name),    /* SSE4 */     \
-      HWY_CHOOSE_SSSE3(func_name),   /* SSSE3 */    \
-      nullptr,                       /* SSE3 */     \
-      nullptr                        /* SSE2 */
+#define HWY_CHOOSE_TARGET_LIST(func_name)                   \
+  nullptr,                           /* reserved */         \
+      nullptr,                       /* reserved */         \
+      nullptr,                       /* reserved */         \
+      nullptr,                       /* reserved */         \
+      nullptr,                       /* reserved */         \
+      nullptr,                       /* reserved */         \
+      nullptr,                       /* reserved */         \
+      HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */          \
+      HWY_CHOOSE_AVX3(func_name),    /* AVX3 */             \
+      HWY_CHOOSE_AVX2(func_name),    /* AVX2 */             \
+      nullptr,                       /* AVX */              \
+      HWY_CHOOSE_SSE4(func_name),    /* SSE4 */             \
+      HWY_CHOOSE_SSSE3(func_name),   /* SSSE3 */            \
+      nullptr,                       /* reserved - SSE3? */  \
+      nullptr                        /* reserved - SSE2? */
 
 #elif HWY_ARCH_ARM
 // See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 4
+#define HWY_MAX_DYNAMIC_TARGETS 15
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
+#define HWY_CHOOSE_TARGET_LIST(func_name)                \
+  nullptr,                            /* reserved */     \
+      nullptr,                        /* reserved */     \
+      nullptr,                        /* reserved */     \
+      nullptr,                        /* reserved */     \
+      nullptr,                        /* reserved */     \
+      nullptr,                        /* reserved */     \
+      nullptr,                        /* reserved */     \
+      nullptr,                        /* reserved */     \
+      nullptr,                        /* reserved */     \
+      HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
+      HWY_CHOOSE_SVE_256(func_name),  /* SVE 256-bit */  \
+      HWY_CHOOSE_SVE2(func_name),     /* SVE2 */         \
+      HWY_CHOOSE_SVE(func_name),      /* SVE */          \
+      HWY_CHOOSE_NEON(func_name),     /* NEON */         \
+      nullptr                         /* reserved - Helium? */
+
+#elif HWY_ARCH_RVV
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 9
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
 #define HWY_CHOOSE_TARGET_LIST(func_name)       \
-  HWY_CHOOSE_SVE2(func_name),    /* SVE2 */     \
-      HWY_CHOOSE_SVE(func_name), /* SVE */      \
+  nullptr,                       /* reserved */ \
+      nullptr,                   /* reserved */ \
+      nullptr,                   /* reserved */ \
+      nullptr,                   /* reserved */ \
+      nullptr,                   /* reserved */ \
+      nullptr,                   /* reserved */ \
       nullptr,                   /* reserved */ \
-      HWY_CHOOSE_NEON(func_name) /* NEON */
+      HWY_CHOOSE_RVV(func_name), /* RVV */      \
+      nullptr                    /* reserved */
 
 #elif HWY_ARCH_PPC
 // See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 5
+#define HWY_MAX_DYNAMIC_TARGETS 9
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
-#define HWY_CHOOSE_TARGET_LIST(func_name)        \
-  nullptr,                        /* reserved */ \
-      nullptr,                    /* reserved */ \
-      HWY_CHOOSE_PPC8(func_name), /* PPC8 */     \
-      nullptr,                    /* VSX */      \
-      nullptr                     /* AltiVec */
+#define HWY_CHOOSE_TARGET_LIST(func_name)                         \
+  nullptr,                        /* reserved */                  \
+      nullptr,                    /* reserved */                  \
+      nullptr,                    /* reserved */                  \
+      nullptr,                    /* reserved */                  \
+      nullptr,                    /* reserved */                  \
+      nullptr,                    /* reserved */                  \
+      HWY_CHOOSE_PPC8(func_name), /* PPC8 */                      \
+      nullptr,                    /* reserved (VSX or AltiVec) */ \
+      nullptr                     /* reserved (VSX or AltiVec) */
 
 #elif HWY_ARCH_WASM
 // See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 4
+#define HWY_MAX_DYNAMIC_TARGETS 9
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
-#define HWY_CHOOSE_TARGET_LIST(func_name)         \
-  nullptr,                         /* reserved */ \
-      nullptr,                     /* reserved */ \
-      HWY_CHOOSE_WASM2(func_name), /* WASM2 */    \
-      HWY_CHOOSE_WASM(func_name)   /* WASM */
-
-#elif HWY_ARCH_RVV
-// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 4
-#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
-#define HWY_CHOOSE_TARGET_LIST(func_name)       \
-  nullptr,                       /* reserved */ \
-      nullptr,                   /* reserved */ \
-      nullptr,                   /* reserved */ \
-      HWY_CHOOSE_RVV(func_name) /* RVV */
+#define HWY_CHOOSE_TARGET_LIST(func_name)                  \
+  nullptr,                               /* reserved */    \
+      nullptr,                           /* reserved */    \
+      nullptr,                           /* reserved */    \
+      nullptr,                           /* reserved */    \
+      nullptr,                           /* reserved */    \
+      nullptr,                           /* reserved */    \
+      HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
+      HWY_CHOOSE_WASM(func_name),        /* WASM */        \
+      nullptr                            /* reserved */
 
 #else
 // Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
@@ -229,41 +262,51 @@ static inline HWY_MAYBE_UNUSED const cha
 #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
 #endif
 
+// Bitfield of supported and enabled targets. The format differs from that of
+// HWY_TARGETS; the lowest bit governs the first function pointer (which is
+// special in that it calls FunctionCache, then Update, then dispatches to the
+// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
+// GetChosenTarget), thread-safe except on RVV.
 struct ChosenTarget {
  public:
-  // Update the ChosenTarget mask based on the current CPU supported
-  // targets.
-  HWY_DLLEXPORT void Update();
+  // Reset bits according to `targets` (typically the return value of
+  // SupportedTargets()). Postcondition: IsInitialized() == true.
+  void Update(int64_t targets) {
+    // These are `targets` shifted downwards, see above. Also include SCALAR
+    // (corresponds to the last entry in the function table) as fallback.
+    StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
+  }
 
-  // Reset the ChosenTarget to the uninitialized state.
+  // Reset to the uninitialized state, so that FunctionCache will call Update
+  // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
   void DeInit() { StoreMask(1); }
 
-  // Whether the ChosenTarget was initialized. This is useful to know whether
-  // any HWY_DYNAMIC_DISPATCH function was called.
+  // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
+  // function was called, which we check in tests.
   bool IsInitialized() const { return LoadMask() != 1; }
 
   // Return the index in the dynamic dispatch table to be used by the current
   // CPU. Note that this method must be in the header file so it uses the value
   // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
-  // calls it, which may be different from others. This allows to only consider
+  // calls it, which may be different from others. This means we only enable
   // those targets that were actually compiled in this module.
   size_t HWY_INLINE GetIndex() const {
-    return hwy::Num0BitsBelowLS1Bit_Nonzero32(LoadMask() &
-                                              HWY_CHOSEN_TARGET_MASK_TARGETS);
+    return hwy::Num0BitsBelowLS1Bit_Nonzero64(
+        static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
   }
 
  private:
   // TODO(janwas): remove #if once <atomic> is available
 #if HWY_ARCH_RVV
-  uint32_t LoadMask() const { return mask_; }
-  void StoreMask(uint32_t mask) { mask_ = mask; }
+  int64_t LoadMask() const { return mask_; }
+  void StoreMask(int64_t mask) { mask_ = mask; }
 
-  uint32_t mask_{1};  // Initialized to 1 so GetIndex() returns 0.
+  int64_t mask_{1};  // Initialized to 1 so GetIndex() returns 0.
 #else
-  uint32_t LoadMask() const { return mask_.load(); }
-  void StoreMask(uint32_t mask) { mask_.store(mask); }
+  int64_t LoadMask() const { return mask_.load(); }
+  void StoreMask(int64_t mask) { mask_.store(mask); }
 
-  std::atomic<uint32_t> mask_{1};  // Initialized to 1 so GetIndex() returns 0.
+  std::atomic<int64_t> mask_{1};  // Initialized to 1 so GetIndex() returns 0.
 #endif  // HWY_ARCH_RVV
 };
 
diff -pruN 0.17.0-11/hwy/targets_test.cc 1.0.0-2/hwy/targets_test.cc
--- 0.17.0-11/hwy/targets_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/targets_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -22,7 +22,7 @@ namespace fake {
 #define DECLARE_FUNCTION(TGT)                                                \
   namespace N_##TGT {                                                        \
     /* Function argument is just to ensure/demonstrate they are possible. */ \
-    uint32_t FakeFunction(int) { return HWY_##TGT; }                         \
+    int64_t FakeFunction(int) { return HWY_##TGT; }                          \
   }
 
 DECLARE_FUNCTION(AVX3_DL)
@@ -33,6 +33,8 @@ DECLARE_FUNCTION(SSSE3)
 DECLARE_FUNCTION(NEON)
 DECLARE_FUNCTION(SVE)
 DECLARE_FUNCTION(SVE2)
+DECLARE_FUNCTION(SVE_256)
+DECLARE_FUNCTION(SVE2_128)
 DECLARE_FUNCTION(PPC8)
 DECLARE_FUNCTION(WASM)
 DECLARE_FUNCTION(RVV)
@@ -41,13 +43,13 @@ DECLARE_FUNCTION(EMU128)
 
 HWY_EXPORT(FakeFunction);
 
-void CallFunctionForTarget(uint32_t target, int line) {
+void CallFunctionForTarget(int64_t target, int line) {
   if ((HWY_TARGETS & target) == 0) return;
   hwy::SetSupportedTargetsForTest(target);
 
   // Call Update() first to make &HWY_DYNAMIC_DISPATCH() return
   // the pointer to the already cached function.
-  hwy::GetChosenTarget().Update();
+  hwy::GetChosenTarget().Update(hwy::SupportedTargets());
 
   EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
 
@@ -55,7 +57,11 @@ void CallFunctionForTarget(uint32_t targ
   // also calls the right function.
   hwy::GetChosenTarget().DeInit();
 
+#if HWY_DISPATCH_WORKAROUND
+  EXPECT_EQ(HWY_STATIC_TARGET, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
+#else
   EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
+#endif
 
   // Second call uses the cached value from the previous call.
   EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
@@ -71,12 +77,14 @@ void CheckFakeFunction() {
   CallFunctionForTarget(HWY_NEON, __LINE__);
   CallFunctionForTarget(HWY_SVE, __LINE__);
   CallFunctionForTarget(HWY_SVE2, __LINE__);
+  CallFunctionForTarget(HWY_SVE_256, __LINE__);
+  CallFunctionForTarget(HWY_SVE2_128, __LINE__);
   CallFunctionForTarget(HWY_PPC8, __LINE__);
   CallFunctionForTarget(HWY_WASM, __LINE__);
   CallFunctionForTarget(HWY_RVV, __LINE__);
   // The tables only have space for either HWY_SCALAR or HWY_EMU128; the former
   // is opt-in only.
-#if defined(HWY_COMPILE_ONLY_SCALAR)
+#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
   CallFunctionForTarget(HWY_SCALAR, __LINE__);
 #else
   CallFunctionForTarget(HWY_EMU128, __LINE__);
@@ -101,25 +109,22 @@ class HwyTargetsTest : public testing::T
 TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }
 
 TEST_F(HwyTargetsTest, DisabledTargetsTest) {
-  DisableTargets(~0u);
-#if HWY_ARCH_X86
-  // Check that the baseline can't be disabled.
-  HWY_ASSERT(HWY_ENABLED_BASELINE == SupportedTargets());
-#else
-  // TODO(janwas): update when targets.cc changes
-  HWY_ASSERT(HWY_TARGETS == SupportedTargets());
-#endif
+  DisableTargets(~0LL);
+  // Check that disabling everything at least leaves the static target.
+  HWY_ASSERT(HWY_STATIC_TARGET == SupportedTargets());
 
   DisableTargets(0);  // Reset the mask.
-  uint32_t current_targets = SupportedTargets();
-  if ((current_targets & ~static_cast<uint32_t>(HWY_ENABLED_BASELINE)) == 0) {
+  const int64_t current_targets = SupportedTargets();
+  const int64_t enabled_baseline = static_cast<int64_t>(HWY_ENABLED_BASELINE);
+  // Exclude these two because they are always returned by SupportedTargets.
+  const int64_t fallback = HWY_SCALAR | HWY_EMU128;
+  if ((current_targets & ~enabled_baseline & ~fallback) == 0) {
     // We can't test anything else if the only compiled target is the baseline.
     return;
   }
+
   // Get the lowest bit in the mask (the best target) and disable that one.
-  uint32_t best_target = current_targets & (~current_targets + 1);
-  // The lowest target shouldn't be one in the baseline.
-  HWY_ASSERT((best_target & ~static_cast<uint32_t>(HWY_ENABLED_BASELINE)) != 0);
+  const int64_t best_target = current_targets & (~current_targets + 1);
   DisableTargets(best_target);
 
   // Check that the other targets are still enabled.
diff -pruN 0.17.0-11/hwy/tests/arithmetic_test.cc 1.0.0-2/hwy/tests/arithmetic_test.cc
--- 0.17.0-11/hwy/tests/arithmetic_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/arithmetic_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -19,7 +19,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -175,6 +175,23 @@ HWY_NOINLINE void TestAllAbs() {
   ForFloatTypes(ForPartialVectors<TestFloatAbs>());
 }
 
+struct TestNeg {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vn = Set(d, T(-3));
+    const auto vp = Set(d, T(3));
+    HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
+    HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
+    HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
+  }
+};
+
+HWY_NOINLINE void TestAllNeg() {
+  ForSignedTypes(ForPartialVectors<TestNeg>());
+  ForFloatTypes(ForPartialVectors<TestNeg>());
+}
+
 struct TestUnsignedMinMax {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -261,16 +278,15 @@ HWY_NOINLINE void TestAllMinMax() {
   ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
 }
 
-class TestMinMax128 {
-  template <class D>
-  static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
-    alignas(16) uint64_t in[2];
-    in[0] = lo;
-    in[1] = hi;
-    return LoadDup128(d, in);
-  }
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+  alignas(16) uint64_t in[2];
+  in[0] = lo;
+  in[1] = hi;
+  return LoadDup128(d, in);
+}
 
- public:
+struct TestMinMax128 {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
     using V = Vec<D>;
@@ -339,149 +355,73 @@ HWY_NOINLINE void TestAllMinMax128() {
   ForGEVectors<128, TestMinMax128>()(uint64_t());
 }
 
-
-struct TestSumOfLanes {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto in_lanes = AllocateAligned<T>(N);
-
-    // Lane i = bit i, higher lanes 0
-    double sum = 0.0;
-    // Avoid setting sign bit and cap at double precision
-    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
-      sum += static_cast<double>(in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
-                      SumOfLanes(d, Load(d, in_lanes.get())));
-
-    // Lane i = i (iota) to include upper lanes
-    sum = 0.0;
-    for (size_t i = 0; i < N; ++i) {
-      sum += static_cast<double>(i);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
-  }
-};
-
-HWY_NOINLINE void TestAllSumOfLanes() {
-  ForUIF3264(ForPartialVectors<TestSumOfLanes>());
-}
-
-struct TestMinOfLanes {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto in_lanes = AllocateAligned<T>(N);
-
-    // Lane i = bit i, higher lanes = 2 (not the minimum)
-    T min = HighestValue<T>();
-    // Avoid setting sign bit and cap at double precision
-    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
-      min = HWY_MIN(min, in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
-
-    // Lane i = N - i to include upper lanes
-    min = HighestValue<T>();
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = static_cast<T>(N - i);  // no 8-bit T so no wraparound
-      min = HWY_MIN(min, in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
-  }
-};
-
-struct TestMaxOfLanes {
+struct TestMinMax128Upper {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
     const size_t N = Lanes(d);
-    auto in_lanes = AllocateAligned<T>(N);
-
-    T max = LowestValue<T>();
-    // Avoid setting sign bit and cap at double precision
-    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
-      max = HWY_MAX(max, in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
-
-    // Lane i = i to include upper lanes
-    max = LowestValue<T>();
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = static_cast<T>(i);  // no 8-bit T so no wraparound
-      max = HWY_MAX(max, in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
-  }
-};
+    auto a_lanes = AllocateAligned<T>(N);
+    auto b_lanes = AllocateAligned<T>(N);
+    auto min_lanes = AllocateAligned<T>(N);
+    auto max_lanes = AllocateAligned<T>(N);
+    RandomState rng;
 
-HWY_NOINLINE void TestAllMinMaxOfLanes() {
-  const ForPartialVectors<TestMinOfLanes> test_min;
-  const ForPartialVectors<TestMaxOfLanes> test_max;
-  ForUIF3264(test_min);
-  ForUIF3264(test_max);
-  test_min(uint16_t());
-  test_max(uint16_t());
-  test_min(int16_t());
-  test_max(int16_t());
-}
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
 
-struct TestSumsOf8 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
+    // Same arg
+    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11));
+    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11));
+
+    // Equivalent but not equal (chooses second arg)
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11));
+    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10));
+    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10));
+    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01));
+    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11));
 
-    const size_t N = Lanes(d);
-    if (N < 8) return;
-    const Repartition<uint64_t, D> du64;
+    // First arg less
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10));
 
-    auto in_lanes = AllocateAligned<T>(N);
-    auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
+    // Second arg less
+    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01));
+    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01));
 
-    for (size_t rep = 0; rep < 100; ++rep) {
+    // Also check 128-bit blocks are independent
+    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
       for (size_t i = 0; i < N; ++i) {
-        in_lanes[i] = Random64(&rng) & 0xFF;
+        a_lanes[i] = Random64(&rng);
+        b_lanes[i] = Random64(&rng);
       }
-
-      for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
-        uint64_t sum = 0;
-        for (size_t i = 0; i < 8; ++i) {
-          sum += in_lanes[idx_sum * 8 + i];
-        }
-        sum_lanes[idx_sum] = sum;
+      const V a = Load(d, a_lanes.get());
+      const V b = Load(d, b_lanes.get());
+      for (size_t i = 0; i < N; i += 2) {
+        const bool lt = a_lanes[i + 1] < b_lanes[i + 1];
+        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
+        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
+        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
+        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
       }
-
-      const Vec<D> in = Load(d, in_lanes.get());
-      HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
+      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b));
+      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b));
     }
   }
 };
 
-HWY_NOINLINE void TestAllSumsOf8() {
-  ForGEVectors<64, TestSumsOf8>()(uint8_t());
-}
-
-struct TestNeg {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vn = Set(d, T(-3));
-    const auto vp = Set(d, T(3));
-    HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
-    HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
-    HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
-  }
-};
-
-HWY_NOINLINE void TestAllNeg() {
-  ForSignedTypes(ForPartialVectors<TestNeg>());
-  ForFloatTypes(ForPartialVectors<TestNeg>());
+HWY_NOINLINE void TestAllMinMax128Upper() {
+  ForGEVectors<128, TestMinMax128Upper>()(uint64_t());
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -495,14 +435,12 @@ namespace hwy {
 HWY_BEFORE_TEST(HwyArithmeticTest);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumsOf8);
 HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper);
 }  // namespace hwy
 
 #endif
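
The Min128Upper/Max128Upper assertions above encode a simple per-block rule: each 128-bit block is a pair of u64 lanes, only the upper lane is compared, and when the upper lanes are equal the second argument wins. A minimal scalar sketch of that rule (placeholder names, not Highway APIs):

#include <cstdint>

struct Blk128 {
  uint64_t lo;
  uint64_t hi;
};

// Ties (equal upper lanes) return b, matching the "chooses second arg" cases.
inline Blk128 Min128UpperRef(Blk128 a, Blk128 b) { return a.hi < b.hi ? a : b; }
inline Blk128 Max128UpperRef(Blk128 a, Blk128 b) { return b.hi < a.hi ? a : b; }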
diff -pruN 0.17.0-11/hwy/tests/blockwise_shift_test.cc 1.0.0-2/hwy/tests/blockwise_shift_test.cc
--- 0.17.0-11/hwy/tests/blockwise_shift_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/blockwise_shift_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -19,7 +19,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -226,7 +226,8 @@ struct TestCombineShiftRight {
   HWY_NOINLINE void operator()(T t, D d) {
 // Scalar does not define CombineShiftRightBytes.
 #if HWY_TARGET != HWY_SCALAR || HWY_IDE
-    constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T)));
+    constexpr int kMaxBytes =
+        HWY_MIN(16, static_cast<int>(MaxLanes(d) * sizeof(T)));
     constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
     TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
     TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d);
diff -pruN 0.17.0-11/hwy/tests/blockwise_test.cc 1.0.0-2/hwy/tests/blockwise_test.cc
--- 0.17.0-11/hwy/tests/blockwise_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/blockwise_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -19,7 +19,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -147,7 +147,7 @@ struct TestTableLookupBytes {
       const uint8_t prev_index = index_bytes[i];
       expected_bytes[i] = 0;
 
-      const int idx = 0x80 + (int(Random32(&rng) & 7) << 4);
+      const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
       HWY_ASSERT(0x80 <= idx && idx < 256);
       index_bytes[i] = static_cast<uint8_t>(idx);
 
@@ -248,25 +248,54 @@ struct TestZipLower {
     const auto even = Load(d, even_lanes.get());
     const auto odd = Load(d, odd_lanes.get());
 
+    const Repartition<WideT, D> dw;
+#if HWY_TARGET == HWY_SCALAR
+    // Safely handle big-endian
+    const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
+#else
     const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
-
     for (size_t i = 0; i < N; i += 2) {
       const size_t base = (i / blockN) * blockN;
       const size_t mod = i % blockN;
       zip_lanes[i + 0] = even_lanes[mod / 2 + base];
       zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
     }
-    const Repartition<WideT, D> dw;
     const auto expected =
         Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
+#endif  // HWY_TARGET == HWY_SCALAR
     HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
     HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
   }
 };
 
+HWY_NOINLINE void TestAllZipLower() {
+  const ForDemoteVectors<TestZipLower> lower_unsigned;
+  lower_unsigned(uint8_t());
+  lower_unsigned(uint16_t());
+#if HWY_HAVE_INTEGER64
+  lower_unsigned(uint32_t());  // generates u64
+#endif
+
+  const ForDemoteVectors<TestZipLower> lower_signed;
+  lower_signed(int8_t());
+  lower_signed(int16_t());
+#if HWY_HAVE_INTEGER64
+  lower_signed(int32_t());  // generates i64
+#endif
+
+  // No float - concatenating f32 does not result in a f64
+}
+
+// Remove this test (so it does not show as having run) if the only target is
+// HWY_SCALAR, which does not support this op.
+#if HWY_TARGETS != HWY_SCALAR
+
 struct TestZipUpper {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+    (void)d;
+#else
     using WideT = MakeWide<T>;
     static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
     static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
@@ -295,24 +324,11 @@ struct TestZipUpper {
     const auto expected =
         Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
     HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
+#endif  // HWY_TARGET == HWY_SCALAR
   }
 };
 
-HWY_NOINLINE void TestAllZip() {
-  const ForDemoteVectors<TestZipLower> lower_unsigned;
-  lower_unsigned(uint8_t());
-  lower_unsigned(uint16_t());
-#if HWY_HAVE_INTEGER64
-  lower_unsigned(uint32_t());  // generates u64
-#endif
-
-  const ForDemoteVectors<TestZipLower> lower_signed;
-  lower_signed(int8_t());
-  lower_signed(int16_t());
-#if HWY_HAVE_INTEGER64
-  lower_signed(int32_t());  // generates i64
-#endif
-
+HWY_NOINLINE void TestAllZipUpper() {
   const ForShrinkableVectors<TestZipUpper> upper_unsigned;
   upper_unsigned(uint8_t());
   upper_unsigned(uint16_t());
@@ -330,6 +346,8 @@ HWY_NOINLINE void TestAllZip() {
   // No float - concatenating f32 does not result in a f64
 }
 
+#endif  // HWY_TARGETS != HWY_SCALAR
+
 class TestSpecialShuffle32 {
  public:
   template <class T, class D>
@@ -424,7 +442,10 @@ HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest,
 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZip);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
+#if HWY_TARGETS != HWY_SCALAR
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
+#endif
 HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
 }  // namespace hwy
 
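
TestZipLower above checks that ZipLower pairs lanes from its two inputs, with the first operand supplying the lower half of each double-width result lane. A per-pair scalar sketch for u8 inputs (hypothetical helper, not a Highway API), which also shows why the HWY_SCALAR path expects 1 << (sizeof(T) * 8) for even = 0, odd = 1:

#include <cstdint>

inline uint16_t ZipPairRef(uint8_t lower, uint8_t upper) {
  return static_cast<uint16_t>(lower | (uint16_t{upper} << 8));
}
// ZipPairRef(0, 1) == 0x0100, i.e. 1 << 8.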
diff -pruN 0.17.0-11/hwy/tests/combine_test.cc 1.0.0-2/hwy/tests/combine_test.cc
--- 0.17.0-11/hwy/tests/combine_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/combine_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -18,8 +18,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/combine_test.cc"
-#include "hwy/foreach_target.h"
-
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -235,6 +234,14 @@ struct TestConcatOddEven {
     const auto odd = Add(even, Set(d, 1));
     HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo));
     HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo));
+
+    // This test catches inadvertent saturation.
+    const auto min = Set(d, LowestValue<T>());
+    const auto max = Set(d, HighestValue<T>());
+    HWY_ASSERT_VEC_EQ(d, max, ConcatOdd(d, max, max));
+    HWY_ASSERT_VEC_EQ(d, max, ConcatEven(d, max, max));
+    HWY_ASSERT_VEC_EQ(d, min, ConcatOdd(d, min, min));
+    HWY_ASSERT_VEC_EQ(d, min, ConcatEven(d, min, min));
 #else
     (void)d;
 #endif
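
The new HighestValue/LowestValue checks in TestConcatOddEven guard against one plausible failure mode: an implementation that happened to route ConcatOdd/ConcatEven through a signed-saturating narrowing step would clamp extreme lanes instead of passing them through. A sketch of that hypothetical clamp (not Highway code):

#include <algorithm>
#include <cstdint>

inline int16_t SaturateToI16(int32_t v) {
  return static_cast<int16_t>(std::min(32767, std::max(-32768, v)));
}
// SaturateToI16(0xFFFF) == 0x7FFF, so a u16 lane holding HighestValue<uint16_t>
// would come back changed, which the added asserts detect.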
diff -pruN 0.17.0-11/hwy/tests/compare_test.cc 1.0.0-2/hwy/tests/compare_test.cc
--- 0.17.0-11/hwy/tests/compare_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/compare_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -19,7 +19,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/compare_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -149,8 +149,21 @@ struct TestStrictInt {
   }
 };
 
+// S-SSE3 bug (#795): same upper, differing MSB in lower
+struct TestStrictInt64 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto m0 = MaskFalse(d);
+    const auto m1 = MaskTrue(d);
+    HWY_ASSERT_MASK_EQ(d, m0, Lt(Set(d, 0x380000000LL), Set(d, 0x300000001LL)));
+    HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000000LL)));
+    HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000001LL)));
+  }
+};
+
 HWY_NOINLINE void TestAllStrictInt() {
   ForSignedTypes(ForPartialVectors<TestStrictInt>());
+  ForPartialVectors<TestStrictInt64>()(int64_t());
 }
 
 struct TestStrictFloat {
@@ -219,16 +232,15 @@ HWY_NOINLINE void TestAllWeakFloat() {
   ForFloatTypes(ForPartialVectors<TestWeakFloat>());
 }
 
-class TestLt128 {
-  template <class D>
-  static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
-    alignas(16) uint64_t in[2];
-    in[0] = lo;
-    in[1] = hi;
-    return LoadDup128(d, in);
-  }
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+  alignas(16) uint64_t in[2];
+  in[0] = lo;
+  in[1] = hi;
+  return LoadDup128(d, in);
+}
 
- public:
+struct TestLt128 {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
     using V = Vec<D>;
@@ -276,6 +288,56 @@ class TestLt128 {
 
 HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); }
 
+struct TestLt128Upper {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using V = Vec<D>;
+    const V v00 = Zero(d);
+    const V v01 = Make128(d, 0, 1);
+    const V v10 = Make128(d, 1, 0);
+    const V v11 = Add(v01, v10);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v10));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v11));
+
+    // Reversed order
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v11, v01));
+
+    // Also check 128-bit blocks are independent
+    const V iota = Iota(d, 1);
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, iota, Add(iota, v01)));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, iota, Add(iota, v10)));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v01), iota));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v10), iota));
+
+    // Max value
+    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v00));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v01));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v10));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v11));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v00, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v10, vm));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v11, vm));
+  }
+};
+
+HWY_NOINLINE void TestAllLt128Upper() {
+  ForGEVectors<128, TestLt128Upper>()(uint64_t());
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
@@ -291,6 +353,7 @@ HWY_EXPORT_AND_TEST_P(HwyCompareTest, Te
 HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
 HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
 HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128Upper);
 }  // namespace hwy
 
 #endif
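
Lt128Upper, as exercised above, compares only the upper u64 lane of each 128-bit block; the test then expects the resulting mask to cover the whole block, hence the comparisons against MaskTrue/MaskFalse. A per-block scalar sketch (placeholder names, not Highway APIs):

#include <cstdint>

struct Pair128 {
  uint64_t lo;
  uint64_t hi;
};

inline bool Lt128UpperRef(Pair128 a, Pair128 b) { return a.hi < b.hi; }
// e.g. {lo=1, hi=0} is not less than {lo=0, hi=0}, but is less than {lo=0, hi=1}.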
diff -pruN 0.17.0-11/hwy/tests/compress_test.cc 1.0.0-2/hwy/tests/compress_test.cc
--- 0.17.0-11/hwy/tests/compress_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/compress_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -24,7 +24,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/compress_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -32,41 +32,41 @@ HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
 
-// For regenerating tables used in the implementation
+// Regenerate tables used in the implementation, instead of testing.
 #define HWY_PRINT_TABLES 0
 
-class TestCompress {
-  template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
-  void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
-                   size_t num_to_check, const AlignedFreeUniquePtr<T[]>& in,
-                   const AlignedFreeUniquePtr<TI[]>& mask_lanes,
-                   const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
-                   int line) {
-    if (expected_pos != actual_pos) {
-      hwy::Abort(
-          __FILE__, line,
-          "Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
-          TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos),
-          static_cast<uint64_t>(actual_pos));
-    }
-    // Modified from AssertVecEqual - we may not be checking all lanes.
-    for (size_t i = 0; i < num_to_check; ++i) {
-      if (!IsEqual(expected[i], actual_u[i])) {
-        const size_t N = Lanes(d);
-        fprintf(stderr,
-                "Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n",
-                static_cast<uint64_t>(i), static_cast<uint64_t>(num_to_check),
-                line);
-        Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
-        Print(d, "in", Load(d, in.get()), 0, N);
-        Print(d, "expect", Load(d, expected.get()), 0, N);
-        Print(d, "actual", Load(d, actual_u), 0, N);
-        HWY_ASSERT(false);
-      }
+#if !HWY_PRINT_TABLES || HWY_IDE
+
+template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
+void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
+                 size_t num_to_check, const AlignedFreeUniquePtr<T[]>& in,
+                 const AlignedFreeUniquePtr<TI[]>& mask_lanes,
+                 const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
+                 int line) {
+  if (expected_pos != actual_pos) {
+    hwy::Abort(
+        __FILE__, line,
+        "Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
+        TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos),
+        static_cast<uint64_t>(actual_pos));
+  }
+  // Modified from AssertVecEqual - we may not be checking all lanes.
+  for (size_t i = 0; i < num_to_check; ++i) {
+    if (!IsEqual(expected[i], actual_u[i])) {
+      const size_t N = Lanes(d);
+      fprintf(stderr, "Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n",
+              static_cast<uint64_t>(i), static_cast<uint64_t>(num_to_check),
+              line);
+      Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
+      Print(d, "in", Load(d, in.get()), 0, N);
+      Print(d, "expect", Load(d, expected.get()), 0, N);
+      Print(d, "actual", Load(d, actual_u), 0, N);
+      HWY_ASSERT(false);
     }
   }
+}
 
- public:
+struct TestCompress {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
     RandomState rng;
@@ -131,19 +131,25 @@ class TestCompress {
         CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
                     mask_lanes, expected, actual_u, __LINE__);
 
+        // CompressNot
+        memset(actual_u, 0, N * sizeof(T));
+        StoreU(CompressNot(in, Not(mask)), d, actual_u);
+        CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
+                    mask_lanes, expected, actual_u, __LINE__);
+
         // CompressStore
         memset(actual_u, 0, N * sizeof(T));
         const size_t size1 = CompressStore(in, mask, d, actual_u);
-        // expected_pos instead of num_to_check because this op is not affected
-        // by CompressIsPartition.
+        // expected_pos instead of num_to_check because this op is not
+        // affected by CompressIsPartition.
         CheckStored(d, di, expected_pos, size1, expected_pos, in_lanes,
                     mask_lanes, expected, actual_u, __LINE__);
 
         // CompressBlendedStore
         memset(actual_u, 0, N * sizeof(T));
         const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
-        // expected_pos instead of num_to_check because this op only writes the
-        // mask=true lanes.
+        // expected_pos instead of num_to_check because this op only writes
+        // the mask=true lanes.
         CheckStored(d, di, expected_pos, size2, expected_pos, in_lanes,
                     mask_lanes, expected, actual_u, __LINE__);
         // Subsequent lanes are untouched.
@@ -160,8 +166,8 @@ class TestCompress {
         // CompressBitsStore
         memset(actual_u, 0, N * sizeof(T));
         const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
-        // expected_pos instead of num_to_check because this op is not affected
-        // by CompressIsPartition.
+        // expected_pos instead of num_to_check because this op is not
+        // affected by CompressIsPartition.
         CheckStored(d, di, expected_pos, size3, expected_pos, in_lanes,
                     mask_lanes, expected, actual_u, __LINE__);
       }  // rep
@@ -169,8 +175,81 @@ class TestCompress {
   }      // operator()
 };
 
-#if HWY_PRINT_TABLES
+HWY_NOINLINE void TestAllCompress() {
+  ForUIF163264(ForPartialVectors<TestCompress>());
+}
+
+struct TestCompressBlocks {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+    (void)d;
+#else
+    static_assert(sizeof(T) == 8 && !IsSigned<T>(), "Should be u64");
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+
+    auto in_lanes = AllocateAligned<T>(N);
+    auto mask_lanes = AllocateAligned<TI>(N);
+    auto expected = AllocateAligned<T>(N);
+    auto actual = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      size_t expected_pos = 0;
+      for (size_t i = 0; i < N; i += 2) {
+        const uint64_t bits = Random32(&rng);
+        in_lanes[i + 1] = in_lanes[i] = T();  // zero-init; CopyBytes below fills both lanes.
+        CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
+        CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]);
+        mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
+        if (mask_lanes[i] > 0) {
+          expected[expected_pos++] = in_lanes[i];
+          expected[expected_pos++] = in_lanes[i + 1];
+        }
+      }
+      size_t num_to_check;
+      if (CompressIsPartition<T>::value) {
+        // For non-native Compress, also check that mask=false lanes were
+        // moved to the back of the vector (highest indices).
+        size_t extra = expected_pos;
+        for (size_t i = 0; i < N; ++i) {
+          if (mask_lanes[i] == 0) {
+            expected[extra++] = in_lanes[i];
+          }
+        }
+        HWY_ASSERT(extra == N);
+        num_to_check = N;
+      } else {
+        // For native Compress, only the mask=true lanes are defined.
+        num_to_check = expected_pos;
+      }
+
+      const auto in = Load(d, in_lanes.get());
+      const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+
+      // CompressBlocksNot
+      memset(actual.get(), 0, N * sizeof(T));
+      StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
+      CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
+                  mask_lanes, expected, actual.get(), __LINE__);
+    }  // rep
+#endif  // HWY_TARGET == HWY_SCALAR
+  }     // operator()
+};
+
+HWY_NOINLINE void TestAllCompressBlocks() {
+  ForGE128Vectors<TestCompressBlocks>()(uint64_t());
+}
+
+#endif  // !HWY_PRINT_TABLES
+
+#if HWY_PRINT_TABLES || HWY_IDE
 namespace detail {  // for code folding
+
 void PrintCompress16x8Tables() {
   printf("======================================= 16x8\n");
   constexpr size_t N = 8;  // 128-bit SIMD
@@ -200,11 +279,11 @@ void PrintCompress16x8Tables() {
   printf("\n");
 }
 
-// Similar to the above, but uses native 16-bit shuffle instead of bytes.
-void PrintCompress16x16HalfTables() {
-  printf("======================================= 16x16Half\n");
-  constexpr size_t N = 8;
-  for (uint64_t code = 0; code < (1ull << N); ++code) {
+void PrintCompressNot16x8Tables() {
+  printf("======================================= Not 16x8\n");
+  constexpr size_t N = 8;  // 128-bit SIMD
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
     std::array<uint8_t, N> indices{0};
     size_t pos = 0;
     // All lanes where mask = true
@@ -221,18 +300,19 @@ void PrintCompress16x16HalfTables() {
     }
     HWY_ASSERT(pos == N);
 
+    // Doubled (for converting lane to byte indices)
     for (size_t i = 0; i < N; ++i) {
-      printf("%d,", indices[i]);
+      printf("%d,", 2 * indices[i]);
     }
-    printf(code & 1 ? "//\n" : "/**/");
+    printf(not_code & 1 ? "//\n" : "/**/");
   }
   printf("\n");
 }
 
-// Compressed to nibbles
+// Compressed to nibbles, unpacked via variable right shift
 void PrintCompress32x8Tables() {
-  printf("======================================= 32x8\n");
-  constexpr size_t N = 8;  // AVX2
+  printf("======================================= 32/64x8\n");
+  constexpr size_t N = 8;  // AVX2 or 64-bit AVX3
   for (uint64_t code = 0; code < (1ull << N); ++code) {
     std::array<uint32_t, N> indices{0};
     size_t pos = 0;
@@ -263,10 +343,44 @@ void PrintCompress32x8Tables() {
   printf("\n");
 }
 
+void PrintCompressNot32x8Tables() {
+  printf("======================================= Not 32/64x8\n");
+  constexpr size_t N = 8;  // AVX2 or 64-bit AVX3
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Convert to nibbles
+    uint64_t packed = 0;
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT(indices[i] < N);
+      packed += indices[i] << (i * 4);
+    }
+
+    HWY_ASSERT(packed < (1ull << (N * 4)));
+    printf("0x%08x,", static_cast<uint32_t>(packed));
+  }
+  printf("\n");
+}
+
 // Compressed to nibbles (for AVX3 64x4)
 void PrintCompress64x4NibbleTables() {
   printf("======================================= 64x4Nibble\n");
-  constexpr size_t N = 4;
+  constexpr size_t N = 4;  // AVX2
   for (uint64_t code = 0; code < (1ull << N); ++code) {
     std::array<uint32_t, N> indices{0};
     size_t pos = 0;
@@ -297,12 +411,109 @@ void PrintCompress64x4NibbleTables() {
   printf("\n");
 }
 
-// Pairs of 32-bit lane indices
+void PrintCompressNot64x4NibbleTables() {
+  printf("======================================= Not 64x4Nibble\n");
+  constexpr size_t N = 4;  // AVX2
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Convert to nibbles
+    uint64_t packed = 0;
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT(indices[i] < N);
+      packed += indices[i] << (i * 4);
+    }
+
+    HWY_ASSERT(packed < (1ull << (N * 4)));
+    printf("0x%08x,", static_cast<uint32_t>(packed));
+  }
+  printf("\n");
+}
+
 void PrintCompress64x4Tables() {
-  printf("======================================= 64x4\n");
+  printf("======================================= 64x4 uncompressed\n");
+  constexpr size_t N = 4;  // SVE_256
+  for (uint64_t code = 0; code < (1ull << N); ++code) {
+    std::array<size_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Store uncompressed indices because SVE TBL returns 0 if an index is out
+    // of bounds. On AVX3 we simply variable-shift because permute indices are
+    // interpreted modulo N. Compression is not worth the extra shift+AND
+    // because the table is anyway only 512 bytes.
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", static_cast<int>(indices[i]));
+    }
+  }
+  printf("\n");
+}
+
+void PrintCompressNot64x4Tables() {
+  printf("======================================= Not 64x4 uncompressed\n");
+  constexpr size_t N = 4;  // SVE_256
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<size_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Store uncompressed indices because SVE TBL returns 0 if an index is out
+    // of bounds. On AVX3 we simply variable-shift because permute indices are
+    // interpreted modulo N. Compression is not worth the extra shift+AND
+    // because the table is anyway only 512 bytes.
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", static_cast<int>(indices[i]));
+    }
+  }
+  printf("\n");
+}
+
+// Same as above, but prints pairs of u32 indices (for AVX2)
+void PrintCompress64x4PairTables() {
+  printf("======================================= 64x4 u32 index\n");
   constexpr size_t N = 4;  // AVX2
   for (uint64_t code = 0; code < (1ull << N); ++code) {
-    std::array<uint32_t, N> indices{0};
+    std::array<size_t, N> indices{0};
     size_t pos = 0;
     // All lanes where mask = true
     for (size_t i = 0; i < N; ++i) {
@@ -318,8 +529,46 @@ void PrintCompress64x4Tables() {
     }
     HWY_ASSERT(pos == N);
 
+    // Store uncompressed indices because SVE TBL returns 0 if an index is out
+    // of bounds. On AVX3 we simply variable-shift because permute indices are
+    // interpreted modulo N. Compression is not worth the extra shift+AND
+    // because the table is anyway only 512 bytes.
     for (size_t i = 0; i < N; ++i) {
-      printf("%d,%d,", 2 * indices[i], 2 * indices[i] + 1);
+      printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
+             static_cast<int>(2 * indices[i]) + 1);
+    }
+  }
+  printf("\n");
+}
+
+void PrintCompressNot64x4PairTables() {
+  printf("======================================= Not 64x4 u32 index\n");
+  constexpr size_t N = 4;  // AVX2
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<size_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    // Store uncompressed indices because SVE TBL returns 0 if an index is out
+    // of bounds. On AVX3 we simply variable-shift because permute indices are
+    // interpreted modulo N. Compression is not worth the extra shift+AND
+    // because the table is anyway only 512 bytes.
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
+             static_cast<int>(2 * indices[i]) + 1);
     }
   }
   printf("\n");
@@ -357,6 +606,38 @@ void PrintCompress32x4Tables() {
   printf("\n");
 }
 
+void PrintCompressNot32x4Tables() {
+  printf("======================================= Not 32x4\n");
+  using T = uint32_t;
+  constexpr size_t N = 4;  // SSE4
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    for (size_t i = 0; i < N; ++i) {
+      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+        printf("%" PRIu64 ",",
+               static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
+      }
+    }
+  }
+  printf("\n");
+}
+
 // 8-tuple of byte indices
 void PrintCompress64x2Tables() {
   printf("======================================= 64x2\n");
@@ -388,23 +669,63 @@ void PrintCompress64x2Tables() {
   }
   printf("\n");
 }
+
+void PrintCompressNot64x2Tables() {
+  printf("======================================= Not 64x2\n");
+  using T = uint64_t;
+  constexpr size_t N = 2;  // SSE4
+  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+    const uint64_t code = ~not_code;
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    // All lanes where mask = true
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+    // All lanes where mask = false
+    for (size_t i = 0; i < N; ++i) {
+      if (!(code & (1ull << i))) {
+        indices[pos++] = i;
+      }
+    }
+    HWY_ASSERT(pos == N);
+
+    for (size_t i = 0; i < N; ++i) {
+      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+        printf("%" PRIu64 ",",
+               static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
+      }
+    }
+  }
+  printf("\n");
+}
+
 }  // namespace detail
-#endif  // HWY_PRINT_TABLES
 
-HWY_NOINLINE void TestAllCompress() {
-#if HWY_PRINT_TABLES
+HWY_NOINLINE void PrintTables() {
+  // Only print once.
+#if HWY_TARGET == HWY_STATIC_TARGET
   detail::PrintCompress32x8Tables();
+  detail::PrintCompressNot32x8Tables();
   detail::PrintCompress64x4NibbleTables();
+  detail::PrintCompressNot64x4NibbleTables();
   detail::PrintCompress64x4Tables();
+  detail::PrintCompressNot64x4Tables();
   detail::PrintCompress32x4Tables();
+  detail::PrintCompressNot32x4Tables();
   detail::PrintCompress64x2Tables();
+  detail::PrintCompressNot64x2Tables();
+  detail::PrintCompress64x4PairTables();
+  detail::PrintCompressNot64x4PairTables();
   detail::PrintCompress16x8Tables();
-  detail::PrintCompress16x16HalfTables();
+  detail::PrintCompressNot16x8Tables();
 #endif
-
-  ForUIF163264(ForPartialVectors<TestCompress>());
 }
 
+#endif  // HWY_PRINT_TABLES
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
@@ -414,7 +735,13 @@ HWY_AFTER_NAMESPACE();
 
 namespace hwy {
 HWY_BEFORE_TEST(HwyCompressTest);
+#if HWY_PRINT_TABLES
+// Only print instead of running tests; this will be visible in the log.
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables);
+#else
 HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress);
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks);
+#endif
 }  // namespace hwy
 
 #endif
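
Both the CheckStored helper and the table generators above rely on the same partition order: mask=true lanes first, in their original order, followed by the mask=false lanes; CompressNot and CompressBlocksNot apply that order to the complemented mask. A scalar sketch of the partition (an assumed reference, not the library implementation); note that Compress itself only guarantees the true-lane prefix, and the false-lane tail only when CompressIsPartition is set:

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> CompressRef(const std::vector<uint64_t>& lanes,
                                  uint64_t mask_bits) {
  std::vector<uint64_t> out;
  out.reserve(lanes.size());
  for (size_t i = 0; i < lanes.size(); ++i) {  // mask=true lanes, in order
    if (mask_bits & (uint64_t{1} << i)) out.push_back(lanes[i]);
  }
  for (size_t i = 0; i < lanes.size(); ++i) {  // then mask=false lanes
    if (!(mask_bits & (uint64_t{1} << i))) out.push_back(lanes[i]);
  }
  return out;
}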
diff -pruN 0.17.0-11/hwy/tests/convert_test.cc 1.0.0-2/hwy/tests/convert_test.cc
--- 0.17.0-11/hwy/tests/convert_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/convert_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -16,11 +16,11 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include "hwy/base.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/convert_test.cc"
-#include "hwy/foreach_target.h"
-
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -335,8 +335,6 @@ struct TestConvertU8 {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
     const Rebind<uint8_t, D> du8;
-    auto lanes8 = AllocateAligned<uint8_t>(Lanes(du8));
-    Store(Iota(du8, 0), du8, lanes8.get());
     const auto wrap = Set(du32, 0xFF);
     HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(And(Iota(du32, 0), wrap)));
     HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F),
@@ -348,6 +346,42 @@ HWY_NOINLINE void TestAllConvertU8() {
   ForDemoteVectors<TestConvertU8, 2>()(uint32_t());
 }
 
+template <typename From, typename To, class D>
+constexpr bool IsSupportedTruncation() {
+  return (sizeof(To) < sizeof(From)) &&
+         (Pow2(Rebind<To, D>()) + 3 >= static_cast<int>(CeilLog2(sizeof(To))));
+}
+
+struct TestTruncateTo {
+  template <typename From, typename To, class D,
+            hwy::EnableIf<!IsSupportedTruncation<From, To, D>()>* = nullptr>
+  HWY_NOINLINE void testTo(From, To, const D) {
+    // do nothing
+  }
+
+  template <typename From, typename To, class D,
+            hwy::EnableIf<IsSupportedTruncation<From, To, D>()>* = nullptr>
+  HWY_NOINLINE void testTo(From, To, const D d) {
+    constexpr uint32_t base = 0xFA578D00;
+    const Rebind<To, D> dTo;
+    const auto src = Iota(d, static_cast<From>(base));
+    const auto expected = Iota(dTo, static_cast<To>(base));
+    const VFromD<decltype(dTo)> actual = TruncateTo(dTo, src);
+    HWY_ASSERT_VEC_EQ(dTo, expected, actual);
+  }
+
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T from, const D d) {
+    testTo<T, uint8_t, D>(from, uint8_t(), d);
+    testTo<T, uint16_t, D>(from, uint16_t(), d);
+    testTo<T, uint32_t, D>(from, uint32_t(), d);
+  }
+};
+
+HWY_NOINLINE void TestAllTruncate() {
+  ForUnsignedTypes(ForPartialVectors<TestTruncateTo>());
+}
+
 // Separate function to attempt to work around a compiler bug on ARM: when this
 // is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input.
 struct TestIntFromFloatHuge {
@@ -390,7 +424,7 @@ class TestIntFromFloat {
     for (int sign = 0; sign < 2; ++sign) {
       for (size_t shift = 0; shift < kBits - 1; ++shift) {
         for (int64_t ofs : ofs_table) {
-          const int64_t mag = (int64_t(1) << shift) + ofs;
+          const int64_t mag = (int64_t{1} << shift) + ofs;
           const int64_t val = sign ? mag : -mag;
           HWY_ASSERT_VEC_EQ(di, Set(di, static_cast<TI>(val)),
                             ConvertTo(di, Set(df, static_cast<TF>(val))));
@@ -554,6 +588,7 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, Te
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
 HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
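
TestTruncateTo above expects TruncateTo to keep only the low bits of each unsigned lane, which is why an Iota starting at 0xFA578D00 truncates to an Iota starting at static_cast<To>(0xFA578D00). A per-lane sketch (hypothetical helper, not a Highway API):

#include <cstdint>

template <typename To, typename From>
inline To TruncateLaneRef(From v) {
  static_assert(sizeof(To) < sizeof(From), "must narrow");
  return static_cast<To>(v);  // unsigned narrowing keeps the low bits
}
// TruncateLaneRef<uint8_t>(0xFA578D01u) == 0x01.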
diff -pruN 0.17.0-11/hwy/tests/crypto_test.cc 1.0.0-2/hwy/tests/crypto_test.cc
--- 0.17.0-11/hwy/tests/crypto_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/crypto_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -21,7 +21,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/crypto_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
diff -pruN 0.17.0-11/hwy/tests/demote_test.cc 1.0.0-2/hwy/tests/demote_test.cc
--- 0.17.0-11/hwy/tests/demote_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/demote_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -19,8 +19,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/demote_test.cc"
-#include "hwy/foreach_target.h"
-
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -214,7 +213,6 @@ class TestReorderDemote2To {
   template <typename TF32, class DF32>
   HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
 #if HWY_TARGET != HWY_SCALAR
-
     size_t padded;
     auto in = ReorderBF16TestCases(d32, padded);
 
@@ -235,11 +233,12 @@ class TestReorderDemote2To {
       const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
 
       // Smoke test: sum should be same (with tolerance for non-associativity)
-      const auto sum_expected =
+      const auto sum_expected = GetLane(SumOfLanes(d32, Add(f0, f1)));
+      const auto sum_actual =
           GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
-      const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
-      HWY_ASSERT(sum_actual - 1E-4 <= sum_actual &&
-                 sum_expected <= sum_actual + 1E-4);
+
+      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
+                 sum_actual <= sum_expected + 1E-4);
 
       // Ensure values are the same after sorting to undo the Reorder
       Store(f0, d32, expected.get() + 0);
diff -pruN 0.17.0-11/hwy/tests/float_test.cc 1.0.0-2/hwy/tests/float_test.cc
--- 0.17.0-11/hwy/tests/float_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/float_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -24,7 +24,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/float_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -222,7 +222,7 @@ struct TestNearestInt {
       if (std::isnan(in[i])) {
         // We replace NaN with 0 below (no_nan)
         expected[i] = 0;
-      } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) {
+      } else if (std::isinf(in[i]) || double{std::abs(in[i])} >= max) {
         // Avoid undefined result for lrintf
         expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
       } else {
diff -pruN 0.17.0-11/hwy/tests/hwy_gtest.h 1.0.0-2/hwy/tests/hwy_gtest.h
--- 0.17.0-11/hwy/tests/hwy_gtest.h	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/hwy_gtest.h	2022-07-27 11:48:16.000000000 +0000
@@ -44,7 +44,7 @@ namespace hwy {
 //   };
 //   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
 //   TEST_P(MyTestSuite, MyTest) { ... }
-class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
+class TestWithParamTarget : public testing::TestWithParam<int64_t> {
  protected:
   void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
 
@@ -53,7 +53,7 @@ class TestWithParamTarget : public testi
     // was compiled with more than one target. In the single-target case only
     // static dispatch will be used anyway.
 #if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
-    EXPECT_TRUE(SupportedTargetsCalledForTest())
+    EXPECT_TRUE(GetChosenTarget().IsInitialized())
         << "This hwy target parametric test doesn't use dynamic-dispatch and "
            "doesn't need to be parametric.";
 #endif
@@ -64,7 +64,7 @@ class TestWithParamTarget : public testi
 // Function to convert the test parameter of a TestWithParamTarget for
 // displaying it in the gtest test name.
 static inline std::string TestParamTargetName(
-    const testing::TestParamInfo<uint32_t>& info) {
+    const testing::TestParamInfo<int64_t>& info) {
   return TargetName(info.param);
 }
 
@@ -85,7 +85,7 @@ static inline std::string TestParamTarge
 //   TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
 template <typename T>
 class TestWithParamTargetAndT
-    : public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
+    : public ::testing::TestWithParam<std::tuple<int64_t, T>> {
  public:
   // Expose the parametric type here so it can be used by the
   // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
@@ -94,7 +94,7 @@ class TestWithParamTargetAndT
  protected:
   void SetUp() override {
     SetSupportedTargetsForTest(std::get<0>(
-        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
+        ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam()));
   }
 
   void TearDown() override {
@@ -102,7 +102,7 @@ class TestWithParamTargetAndT
     // was compiled with more than one target. In the single-target case only
     // static dispatch will be used anyway.
 #if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
-    EXPECT_TRUE(SupportedTargetsCalledForTest())
+    EXPECT_TRUE(GetChosenTarget().IsInitialized())
         << "This hwy target parametric test doesn't use dynamic-dispatch and "
            "doesn't need to be parametric.";
 #endif
@@ -111,13 +111,13 @@ class TestWithParamTargetAndT
 
   T GetParam() {
     return std::get<1>(
-        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
+        ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam());
   }
 };
 
 template <typename T>
 std::string TestParamTargetNameAndT(
-    const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
+    const testing::TestParamInfo<std::tuple<int64_t, T>>& info) {
   return std::string(TargetName(std::get<0>(info.param))) + "_" +
          ::testing::PrintToString(std::get<1>(info.param));
 }
diff -pruN 0.17.0-11/hwy/tests/if_test.cc 1.0.0-2/hwy/tests/if_test.cc
--- 0.17.0-11/hwy/tests/if_test.cc	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/tests/if_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,175 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/if_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestIfThenElse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+    auto in1 = AllocateAligned<T>(N);
+    auto in2 = AllocateAligned<T>(N);
+    auto bool_lanes = AllocateAligned<TI>(N);
+    auto expected = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in1[i] = static_cast<T>(Random32(&rng));
+        in2[i] = static_cast<T>(Random32(&rng));
+        bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
+      }
+
+      const auto v1 = Load(d, in1.get());
+      const auto v2 = Load(d, in2.get());
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = bool_lanes[i] ? in1[i] : in2[i];
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = bool_lanes[i] ? in1[i] : T(0);
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = bool_lanes[i] ? T(0) : in2[i];
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllIfThenElse() {
+  ForAllTypes(ForPartialVectors<TestIfThenElse>());
+}
+
+struct TestIfVecThenElse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TU = MakeUnsigned<T>;  // For all-one mask
+    const Rebind<TU, D> du;
+    const size_t N = Lanes(d);
+    auto in1 = AllocateAligned<T>(N);
+    auto in2 = AllocateAligned<T>(N);
+    auto vec_lanes = AllocateAligned<TU>(N);
+    auto expected = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in1[i] = static_cast<T>(Random32(&rng));
+        in2[i] = static_cast<T>(Random32(&rng));
+        vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
+      }
+
+      const auto v1 = Load(d, in1.get());
+      const auto v2 = Load(d, in2.get());
+      const auto vec = BitCast(d, Load(du, vec_lanes.get()));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = vec_lanes[i] ? in1[i] : in2[i];
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllIfVecThenElse() {
+  ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
+}
+
+struct TestZeroIfNegative {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp = Iota(d, 1);
+    const auto vn = Iota(d, T(-1E5));  // assumes N < 10^5
+
+    // Zero and positive remain unchanged
+    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
+    HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
+
+    // Negative are all replaced with zero
+    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
+  }
+};
+
+HWY_NOINLINE void TestAllZeroIfNegative() {
+  ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
+}
+
+struct TestIfNegative {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp = Iota(d, 1);
+    const auto vn = Or(vp, SignBit(d));
+
+    // Zero and positive remain unchanged
+    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
+    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
+    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
+    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));
+
+    // Negative are replaced with 2nd arg
+    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
+    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
+    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
+  }
+};
+
+HWY_NOINLINE void TestAllIfNegative() {
+  ForFloatTypes(ForPartialVectors<TestIfNegative>());
+  ForSignedTypes(ForPartialVectors<TestIfNegative>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyIfTest);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfThenElse);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfVecThenElse);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllZeroIfNegative);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfNegative);
+}  // namespace hwy
+
+#endif
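
The new if_test.cc draws a distinction worth spelling out: IfThenElse selects whole lanes via a mask, whereas IfVecThenElse selects individual bits of a plain vector, which is why TestIfVecThenElse fills vec_lanes only with all-zero or all-one bit patterns. Per-lane sketches for unsigned 32-bit lanes (assumptions, not the Highway implementations):

#include <cstdint>

inline uint32_t IfThenElseRef(bool m, uint32_t yes, uint32_t no) {
  return m ? yes : no;
}
inline uint32_t IfVecThenElseRef(uint32_t vec, uint32_t yes, uint32_t no) {
  return (vec & yes) | (~vec & no);  // bitwise select
}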
diff -pruN 0.17.0-11/hwy/tests/interleaved_test.cc 1.0.0-2/hwy/tests/interleaved_test.cc
--- 0.17.0-11/hwy/tests/interleaved_test.cc	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/tests/interleaved_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,256 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/interleaved_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLoadStoreInterleaved2 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<T>(2 * N);
+    for (size_t i = 0; i < 2 * N; ++i) {
+      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(3 * N);
+    auto actual_aligned = AllocateAligned<T>(3 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[2 * i + 0] = bytes[0 * N + i];
+        expected[2 * i + 1] = bytes[1 * N + i];
+        // Ensure we do not write more than 2*N output elements
+        expected[2 * N + i] = actual[2 * N + i] = 0;
+      }
+      StoreInterleaved2(in0, in1, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) {
+        Print(d, "in0", in0, pos / 4);
+        Print(d, "in1", in1, pos / 4);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved i=%d %f %f %f %f  %f %f %f %f\n",
+                static_cast<int>(i), static_cast<double>(actual[i]),
+                static_cast<double>(actual[i + 1]),
+                static_cast<double>(actual[i + 2]),
+                static_cast<double>(actual[i + 3]),
+                static_cast<double>(actual[i + 4]),
+                static_cast<double>(actual[i + 5]),
+                static_cast<double>(actual[i + 6]),
+                static_cast<double>(actual[i + 7]));
+        HWY_ASSERT(false);
+      }
+
+      Vec<D> out0, out1;
+      LoadInterleaved2(d, actual, out0, out1);
+      HWY_ASSERT_VEC_EQ(d, in0, out0);
+      HWY_ASSERT_VEC_EQ(d, in1, out1);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved2() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestLoadStoreInterleaved2, 2> test;
+#else
+  const ForPartialVectors<TestLoadStoreInterleaved2> test;
+#endif
+  ForAllTypes(test);
+}
+
+// Workaround for build timeout on GCC 12 aarch64, see #776
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_ARCH_ARM_A64
+#define HWY_BROKEN_LOAD34 1
+#else
+#define HWY_BROKEN_LOAD34 0
+#endif
+
+#if !HWY_BROKEN_LOAD34
+
+struct TestLoadStoreInterleaved3 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<T>(3 * N);
+    for (size_t i = 0; i < 3 * N; ++i) {
+      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(4 * N);
+    auto actual_aligned = AllocateAligned<T>(4 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[3 * i + 0] = bytes[0 * N + i];
+        expected[3 * i + 1] = bytes[1 * N + i];
+        expected[3 * i + 2] = bytes[2 * N + i];
+        // Ensure we do not write more than 3*N output elements
+        expected[3 * N + i] = actual[3 * N + i] = 0;
+      }
+      StoreInterleaved3(in0, in1, in2, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) {
+        Print(d, "in0", in0, pos / 3, N);
+        Print(d, "in1", in1, pos / 3, N);
+        Print(d, "in2", in2, pos / 3, N);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved i=%d %f %f %f  %f %f %f\n",
+                static_cast<int>(i), static_cast<double>(actual[i]),
+                static_cast<double>(actual[i + 1]),
+                static_cast<double>(actual[i + 2]),
+                static_cast<double>(actual[i + 3]),
+                static_cast<double>(actual[i + 4]),
+                static_cast<double>(actual[i + 5]));
+        HWY_ASSERT(false);
+      }
+
+      Vec<D> out0, out1, out2;
+      LoadInterleaved3(d, actual, out0, out1, out2);
+      HWY_ASSERT_VEC_EQ(d, in0, out0);
+      HWY_ASSERT_VEC_EQ(d, in1, out1);
+      HWY_ASSERT_VEC_EQ(d, in2, out2);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved3() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test;
+#else
+  const ForPartialVectors<TestLoadStoreInterleaved3> test;
+#endif
+  ForAllTypes(test);
+}
+
+struct TestLoadStoreInterleaved4 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<T>(4 * N);
+
+    for (size_t i = 0; i < 4 * N; ++i) {
+      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+    const auto in3 = Load(d, &bytes[3 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(5 * N);
+    auto actual_aligned = AllocateAligned<T>(5 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[4 * i + 0] = bytes[0 * N + i];
+        expected[4 * i + 1] = bytes[1 * N + i];
+        expected[4 * i + 2] = bytes[2 * N + i];
+        expected[4 * i + 3] = bytes[3 * N + i];
+        // Ensure we do not write more than 4*N output elements
+        expected[4 * N + i] = actual[4 * N + i] = 0;
+      }
+      StoreInterleaved4(in0, in1, in2, in3, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) {
+        Print(d, "in0", in0, pos / 4);
+        Print(d, "in1", in1, pos / 4);
+        Print(d, "in2", in2, pos / 4);
+        Print(d, "in3", in3, pos / 4);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved i=%d %f %f %f %f  %f %f %f %f\n",
+                static_cast<int>(i), static_cast<double>(actual[i]),
+                static_cast<double>(actual[i + 1]),
+                static_cast<double>(actual[i + 2]),
+                static_cast<double>(actual[i + 3]),
+                static_cast<double>(actual[i + 4]),
+                static_cast<double>(actual[i + 5]),
+                static_cast<double>(actual[i + 6]),
+                static_cast<double>(actual[i + 7]));
+        HWY_ASSERT(false);
+      }
+
+      Vec<D> out0, out1, out2, out3;
+      LoadInterleaved4(d, actual, out0, out1, out2, out3);
+      HWY_ASSERT_VEC_EQ(d, in0, out0);
+      HWY_ASSERT_VEC_EQ(d, in1, out1);
+      HWY_ASSERT_VEC_EQ(d, in2, out2);
+      HWY_ASSERT_VEC_EQ(d, in3, out3);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved4() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test;
+#else
+  const ForPartialVectors<TestLoadStoreInterleaved4> test;
+#endif
+  ForAllTypes(test);
+}
+
+#endif  // !HWY_BROKEN_LOAD34
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyInterleavedTest);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved2);
+#if !HWY_BROKEN_LOAD34
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved3);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved4);
+#endif
+}  // namespace hwy
+
+#endif
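
The new interleaved_test.cc takes over the (Load|Store)Interleaved2/3/4 coverage that the memory_test.cc hunks further below remove. For orientation, a minimal sketch of how LoadInterleaved3 is typically used, e.g. splitting packed RGB bytes into planes; the DeinterleaveRGB helper and its whole-vector-multiple assumption are illustrative only, not part of the patch:

#include <stddef.h>
#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Hypothetical helper: split packed RGB bytes into three planar arrays.
// Assumes `size` is a multiple of hn::Lanes(d); a real loop would also
// handle the remainder.
void DeinterleaveRGB(const uint8_t* HWY_RESTRICT rgb, size_t size,
                     uint8_t* HWY_RESTRICT r, uint8_t* HWY_RESTRICT g,
                     uint8_t* HWY_RESTRICT b) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < size; i += N) {
    hn::Vec<decltype(d)> vr, vg, vb;
    hn::LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
    hn::StoreU(vr, d, r + i);
    hn::StoreU(vg, d, g + i);
    hn::StoreU(vb, d, b + i);
  }
}
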
diff -pruN 0.17.0-11/hwy/tests/list_targets.cc 1.0.0-2/hwy/tests/list_targets.cc
--- 0.17.0-11/hwy/tests/list_targets.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/list_targets.cc	2022-07-27 11:48:16.000000000 +0000
@@ -20,10 +20,10 @@
 
 #include "hwy/highway.h"
 
-void PrintTargets(const char* msg, uint32_t targets) {
+void PrintTargets(const char* msg, int64_t targets) {
   fprintf(stderr, "%s", msg);
   // For each bit:
-  for (uint32_t x = targets; x != 0; x = x & (x - 1)) {
+  for (int64_t x = targets; x != 0; x = x & (x - 1)) {
     // Extract value of least-significant bit.
     fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
   }
@@ -31,8 +31,41 @@ void PrintTargets(const char* msg, uint3
 }
 
 int main() {
-  PrintTargets("Compiled HWY_TARGETS:", HWY_TARGETS);
-  PrintTargets("HWY_BASELINE_TARGETS:", HWY_BASELINE_TARGETS);
-  PrintTargets("Current CPU supports:", hwy::SupportedTargets());
+#ifdef HWY_COMPILE_ONLY_EMU128
+  const int only_emu128 = 1;
+#else
+  const int only_emu128 = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_SCALAR
+  const int only_scalar = 1;
+#else
+  const int only_scalar = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_STATIC
+  const int only_static = 1;
+#else
+  const int only_static = 0;
+#endif
+#ifdef HWY_COMPILE_ALL_ATTAINABLE
+  const int all_attain = 1;
+#else
+  const int all_attain = 0;
+#endif
+#ifdef HWY_IS_TEST
+  const int is_test = 1;
+#else
+  const int is_test = 0;
+#endif
+
+  fprintf(stderr,
+          "Config: emu128:%d scalar:%d static:%d all_attain:%d is_test:%d\n",
+          only_emu128, only_scalar, only_static, all_attain, is_test);
+  PrintTargets("Compiled HWY_TARGETS:  ", HWY_TARGETS);
+  PrintTargets("HWY_ATTAINABLE_TARGETS:", HWY_ATTAINABLE_TARGETS);
+  PrintTargets("HWY_BASELINE_TARGETS:  ", HWY_BASELINE_TARGETS);
+  PrintTargets("HWY_STATIC_TARGET:     ", HWY_STATIC_TARGET);
+  PrintTargets("HWY_BROKEN_TARGETS:    ", HWY_BROKEN_TARGETS);
+  PrintTargets("HWY_DISABLED_TARGETS:  ", HWY_DISABLED_TARGETS);
+  PrintTargets("Current CPU supports:  ", hwy::SupportedTargets());
   return 0;
 }
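
list_targets now reports the full target configuration (attainable, baseline, static, broken, disabled) and treats target sets as 64-bit bitfields instead of uint32_t. A minimal sketch of how an application might query the same information at runtime; the usage below is illustrative and assumes the target-bit macros (e.g. HWY_AVX2) and the int64_t return type shown in the hunk above:

#include <stdint.h>
#include <stdio.h>

#include "hwy/highway.h"

int main() {
  // Each set bit in SupportedTargets() is one target usable on this CPU.
  const int64_t supported = hwy::SupportedTargets();
  if (supported & HWY_AVX2) {
    printf("AVX2 code paths are available\n");
  }
  // TargetName() maps a single target bit to a human-readable name.
  printf("Statically compiled baseline: %s\n",
         hwy::TargetName(HWY_STATIC_TARGET));
  return 0;
}
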
diff -pruN 0.17.0-11/hwy/tests/logical_test.cc 1.0.0-2/hwy/tests/logical_test.cc
--- 0.17.0-11/hwy/tests/logical_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/logical_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -21,7 +21,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/logical_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -174,92 +174,10 @@ struct TestCopySign {
   }
 };
 
-struct TestIfVecThenElse {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TU = MakeUnsigned<T>;  // For all-one mask
-    const Rebind<TU, D> du;
-    const size_t N = Lanes(d);
-    auto in1 = AllocateAligned<T>(N);
-    auto in2 = AllocateAligned<T>(N);
-    auto vec_lanes = AllocateAligned<TU>(N);
-    auto expected = AllocateAligned<T>(N);
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        in1[i] = static_cast<T>(Random32(&rng));
-        in2[i] = static_cast<T>(Random32(&rng));
-        vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
-      }
-
-      const auto v1 = Load(d, in1.get());
-      const auto v2 = Load(d, in2.get());
-      const auto vec = BitCast(d, Load(du, vec_lanes.get()));
-
-      for (size_t i = 0; i < N; ++i) {
-        expected[i] = vec_lanes[i] ? in1[i] : in2[i];
-      }
-      HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllIfVecThenElse() {
-  ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
-}
-
 HWY_NOINLINE void TestAllCopySign() {
   ForFloatTypes(ForPartialVectors<TestCopySign>());
 }
 
-struct TestZeroIfNegative {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vp = Iota(d, 1);
-    const auto vn = Iota(d, T(-1E5));  // assumes N < 10^5
-
-    // Zero and positive remain unchanged
-    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
-    HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
-
-    // Negative are all replaced with zero
-    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
-  }
-};
-
-HWY_NOINLINE void TestAllZeroIfNegative() {
-  ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
-}
-
-struct TestIfNegative {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vp = Iota(d, 1);
-    const auto vn = Or(vp, SignBit(d));
-
-    // Zero and positive remain unchanged
-    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
-    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
-    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
-    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));
-
-    // Negative are replaced with 2nd arg
-    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
-    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
-    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
-  }
-};
-
-HWY_NOINLINE void TestAllIfNegative() {
-  ForFloatTypes(ForPartialVectors<TestIfNegative>());
-  ForSignedTypes(ForPartialVectors<TestIfNegative>());
-}
-
 struct TestBroadcastSignBit {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -343,10 +261,7 @@ namespace hwy {
 HWY_BEFORE_TEST(HwyLogicalTest);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfVecThenElse);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfNegative);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
 HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);
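
The IfVecThenElse, ZeroIfNegative and IfNegativeThenElse tests are dropped from logical_test.cc; judging by the BUILD changes they presumably move to the new if_test.cc. For reference, a minimal sketch of the ZeroIfNegative op exercised by the removed test (helper name and the size assumption are illustrative):

#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Clamp negative floats to zero in place. Assumes `size` is a multiple of
// the vector length; a real loop would handle the tail separately.
void ClampNegativesToZero(float* HWY_RESTRICT data, size_t size) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < size; i += N) {
    hn::StoreU(hn::ZeroIfNegative(hn::LoadU(d, data + i)), d, data + i);
  }
}
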
diff -pruN 0.17.0-11/hwy/tests/mask_mem_test.cc 1.0.0-2/hwy/tests/mask_mem_test.cc
--- 0.17.0-11/hwy/tests/mask_mem_test.cc	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/tests/mask_mem_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,194 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcmp
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mask_mem_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestMaskedLoad {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<TI>(N);
+
+    auto lanes = AllocateAligned<T>(N);
+    Store(Iota(d, T{1}), d, lanes.get());
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+      }
+
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+      const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
+      const auto actual = MaskedLoad(mask, d, lanes.get());
+      HWY_ASSERT_VEC_EQ(d, expected, actual);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMaskedLoad() {
+  ForAllTypes(ForPartialVectors<TestMaskedLoad>());
+}
+
+struct TestBlendedStore {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(d);
+    auto bool_lanes = AllocateAligned<TI>(N);
+
+    const Vec<D> v = Iota(d, T{1});
+    auto actual = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+        // Re-initialize to something distinct from v[i].
+        actual[i] = static_cast<T>(127 - (i & 127));
+        expected[i] = bool_lanes[i] ? static_cast<T>(i + 1) : actual[i];
+      }
+
+      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+      BlendedStore(v, mask, d, actual.get());
+      HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get()));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllBlendedStore() {
+  ForAllTypes(ForPartialVectors<TestBlendedStore>());
+}
+
+class TestStoreMaskBits {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
+    RandomState rng;
+    using TI = MakeSigned<T>;  // For mask > 0 comparison
+    const Rebind<TI, D> di;
+    const size_t N = Lanes(di);
+    auto bool_lanes = AllocateAligned<TI>(N);
+
+    const ScalableTag<uint8_t, -3> d_bits;
+    const size_t expected_num_bytes = (N + 7) / 8;
+    auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
+    auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));
+
+    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+      // Generate random mask pattern.
+      for (size_t i = 0; i < N; ++i) {
+        bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
+      }
+      const auto bools = Load(di, bool_lanes.get());
+      const auto mask = Gt(bools, Zero(di));
+
+      // Requires at least 8 bytes, ensured above.
+      const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
+      if (bytes_written != expected_num_bytes) {
+        fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
+                TypeName(T(), N).c_str(),
+                static_cast<uint64_t>(expected_num_bytes),
+                static_cast<uint64_t>(bytes_written));
+
+        HWY_ASSERT(false);
+      }
+
+      // Requires at least 8 bytes, ensured above.
+      const auto mask2 = LoadMaskBits(di, actual.get());
+      HWY_ASSERT_MASK_EQ(di, mask, mask2);
+
+      memset(expected.get(), 0, expected_num_bytes);
+      for (size_t i = 0; i < N; ++i) {
+        expected[i / 8] =
+            static_cast<uint8_t>(expected[i / 8] | (bool_lanes[i] << (i % 8)));
+      }
+
+      size_t i = 0;
+      // Stored bits must match original mask
+      for (; i < N; ++i) {
+        const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
+        if (is_set != bool_lanes[i]) {
+          fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
+                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
+                  static_cast<int>(bool_lanes[i]), static_cast<int>(is_set));
+          Print(di, "bools", bools, 0, N);
+          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
+                expected_num_bytes);
+          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
+                expected_num_bytes);
+
+          HWY_ASSERT(false);
+        }
+      }
+      // Any partial bits in the last byte must be zero
+      for (; i < 8 * bytes_written; ++i) {
+        const int bit = (actual[i / 8] & (1 << (i % 8)));
+        if (bit != 0) {
+          fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
+                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
+          Print(di, "bools", bools, 0, N);
+          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
+                expected_num_bytes);
+          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
+                expected_num_bytes);
+
+          HWY_ASSERT(false);
+        }
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStoreMaskBits() {
+  ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMaskTest);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
+}  // namespace hwy
+
+#endif
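
The new mask_mem_test.cc collects the mask/memory ops (MaskedLoad, BlendedStore, Store/LoadMaskBits) formerly tested in mask_test.cc. A minimal sketch of the usual application of MaskedLoad and BlendedStore, namely handling a partial final vector; the AddOne helper is hypothetical, and it assumes the buffer is readable up to the next vector-length multiple because MaskedLoad may be emulated with a whole-vector load on targets without native masked loads:

#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Add 1 to every element; the final partial vector is written via a FirstN
// mask so no lane beyond `size` is modified.
void AddOne(float* HWY_RESTRICT data, size_t size) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  const auto one = hn::Set(d, 1.0f);
  size_t i = 0;
  for (; i + N <= size; i += N) {
    hn::StoreU(hn::Add(hn::LoadU(d, data + i), one), d, data + i);
  }
  if (i < size) {
    const auto mask = hn::FirstN(d, size - i);  // first (size - i) lanes
    const auto v = hn::MaskedLoad(mask, d, data + i);
    hn::BlendedStore(hn::Add(v, one), mask, d, data + i);
  }
}
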
diff -pruN 0.17.0-11/hwy/tests/mask_test.cc 1.0.0-2/hwy/tests/mask_test.cc
--- 0.17.0-11/hwy/tests/mask_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/mask_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -20,8 +20,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/mask_test.cc"
-#include "hwy/foreach_target.h"
-
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -83,53 +82,6 @@ HWY_NOINLINE void TestAllFirstN() {
   ForAllTypes(ForPartialVectors<TestFirstN>());
 }
 
-struct TestIfThenElse {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-    auto in1 = AllocateAligned<T>(N);
-    auto in2 = AllocateAligned<T>(N);
-    auto bool_lanes = AllocateAligned<TI>(N);
-    auto expected = AllocateAligned<T>(N);
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        in1[i] = static_cast<T>(Random32(&rng));
-        in2[i] = static_cast<T>(Random32(&rng));
-        bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
-      }
-
-      const auto v1 = Load(d, in1.get());
-      const auto v2 = Load(d, in2.get());
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-
-      for (size_t i = 0; i < N; ++i) {
-        expected[i] = bool_lanes[i] ? in1[i] : in2[i];
-      }
-      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
-
-      for (size_t i = 0; i < N; ++i) {
-        expected[i] = bool_lanes[i] ? in1[i] : T(0);
-      }
-      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
-
-      for (size_t i = 0; i < N; ++i) {
-        expected[i] = bool_lanes[i] ? T(0) : in2[i];
-      }
-      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllIfThenElse() {
-  ForAllTypes(ForPartialVectors<TestIfThenElse>());
-}
-
 struct TestMaskVec {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -162,71 +114,6 @@ HWY_NOINLINE void TestAllMaskVec() {
   ForUIF3264(test);
 }
 
-struct TestMaskedLoad {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-    auto bool_lanes = AllocateAligned<TI>(N);
-
-    auto lanes = AllocateAligned<T>(N);
-    Store(Iota(d, T{1}), d, lanes.get());
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
-      }
-
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-      const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
-      const auto actual = MaskedLoad(mask, d, lanes.get());
-      HWY_ASSERT_VEC_EQ(d, expected, actual);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllMaskedLoad() {
-  ForAllTypes(ForPartialVectors<TestMaskedLoad>());
-}
-
-struct TestBlendedStore {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-    auto bool_lanes = AllocateAligned<TI>(N);
-
-    const Vec<D> v = Iota(d, T{1});
-    auto actual = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<T>(N);
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
-        // Re-initialize to something distinct from v[i].
-        actual[i] = static_cast<T>(127 - (i & 127));
-        expected[i] = bool_lanes[i] ? static_cast<T>(i + 1) : actual[i];
-      }
-
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-      BlendedStore(v, mask, d, actual.get());
-      HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get()));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllBlendedStore() {
-  ForAllTypes(ForPartialVectors<TestBlendedStore>());
-}
-
 struct TestAllTrueFalse {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -237,8 +124,6 @@ struct TestAllTrueFalse {
     auto lanes = AllocateAligned<T>(N);
     std::fill(lanes.get(), lanes.get() + N, T(0));
 
-    auto mask_lanes = AllocateAligned<T>(N);
-
     HWY_ASSERT(AllTrue(d, Eq(v, zero)));
     HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
 
@@ -251,11 +136,7 @@ struct TestAllTrueFalse {
       lanes[i] = T(1);
       v = Load(d, lanes.get());
 
-      // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
-      // Assigning to an lvalue is insufficient but storing to memory prevents
-      // the bug; so does Print of VecFromMask(d, Eq(v, zero)).
-      Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
-      HWY_ASSERT(!AllTrue(d, MaskFromVec(Load(d, mask_lanes.get()))));
+      HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
 
       HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
 
@@ -277,89 +158,6 @@ HWY_NOINLINE void TestAllAllTrueFalse()
   ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
 }
 
-class TestStoreMaskBits {
- public:
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
-    RandomState rng;
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(di);
-    auto bool_lanes = AllocateAligned<TI>(N);
-
-    const ScalableTag<uint8_t, -3> d_bits;
-    const size_t expected_num_bytes = (N + 7) / 8;
-    auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
-    auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));
-
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      // Generate random mask pattern.
-      for (size_t i = 0; i < N; ++i) {
-        bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
-      }
-      const auto bools = Load(di, bool_lanes.get());
-      const auto mask = Gt(bools, Zero(di));
-
-      // Requires at least 8 bytes, ensured above.
-      const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
-      if (bytes_written != expected_num_bytes) {
-        fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
-                TypeName(T(), N).c_str(),
-                static_cast<uint64_t>(expected_num_bytes),
-                static_cast<uint64_t>(bytes_written));
-
-        HWY_ASSERT(false);
-      }
-
-      // Requires at least 8 bytes, ensured above.
-      const auto mask2 = LoadMaskBits(di, actual.get());
-      HWY_ASSERT_MASK_EQ(di, mask, mask2);
-
-      memset(expected.get(), 0, expected_num_bytes);
-      for (size_t i = 0; i < N; ++i) {
-        expected[i / 8] = uint8_t(expected[i / 8] | (bool_lanes[i] << (i % 8)));
-      }
-
-      size_t i = 0;
-      // Stored bits must match original mask
-      for (; i < N; ++i) {
-        const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
-        if (is_set != bool_lanes[i]) {
-          fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
-                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
-                  int(bool_lanes[i]), int(is_set));
-          Print(di, "bools", bools, 0, N);
-          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
-                expected_num_bytes);
-          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
-                expected_num_bytes);
-
-          HWY_ASSERT(false);
-        }
-      }
-      // Any partial bits in the last byte must be zero
-      for (; i < 8 * bytes_written; ++i) {
-        const int bit = (actual[i / 8] & (1 << (i % 8)));
-        if (bit != 0) {
-          fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
-                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
-          Print(di, "bools", bools, 0, N);
-          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
-                expected_num_bytes);
-          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
-                expected_num_bytes);
-
-          HWY_ASSERT(false);
-        }
-      }
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllStoreMaskBits() {
-  ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
-}
-
 struct TestCountTrue {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -412,8 +210,8 @@ struct TestFindFirstTrue {
         bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
       }
 
-      const intptr_t expected =
-          static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(uint32_t(code)));
+      const intptr_t expected = static_cast<intptr_t>(
+          Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code)));
       const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
       const intptr_t actual = FindFirstTrue(d, mask);
       HWY_ASSERT_EQ(expected, actual);
@@ -480,12 +278,8 @@ namespace hwy {
 HWY_BEFORE_TEST(HwyMaskTest);
 HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec);
 HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllIfThenElse);
 HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore);
 HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
 HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue);
 HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue);
 HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask);
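
mask_test.cc keeps the mask-logic tests (FirstN, CountTrue, FindFirstTrue, AllTrue/AllFalse) and drops the memory-related ones now in mask_mem_test.cc. A minimal sketch of FindFirstTrue, which the retained test exercises; the FindValue helper and its whole-vector-multiple assumption are illustrative:

#include <stddef.h>
#include <stdint.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns the index of the first element equal to `key`, or -1 if absent.
// Assumes `size` is a multiple of the vector length.
intptr_t FindValue(const int32_t* HWY_RESTRICT data, size_t size,
                   int32_t key) {
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  const auto vkey = hn::Set(d, key);
  for (size_t i = 0; i < size; i += N) {
    const auto eq = hn::Eq(hn::LoadU(d, data + i), vkey);
    const intptr_t pos = hn::FindFirstTrue(d, eq);  // -1 if no lane matches
    if (pos >= 0) return static_cast<intptr_t>(i) + pos;
  }
  return -1;
}
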
diff -pruN 0.17.0-11/hwy/tests/memory_test.cc 1.0.0-2/hwy/tests/memory_test.cc
--- 0.17.0-11/hwy/tests/memory_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/memory_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -26,7 +26,7 @@
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/memory_test.cc"
 #include "hwy/cache_control.h"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -132,206 +132,6 @@ HWY_NOINLINE void TestAllSafeCopyN() {
   ForAllTypes(ForPartialVectors<TestSafeCopyN>());
 }
 
-struct TestLoadStoreInterleaved2 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-
-    RandomState rng;
-
-    // Data to be interleaved
-    auto bytes = AllocateAligned<T>(2 * N);
-    for (size_t i = 0; i < 2 * N; ++i) {
-      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
-    }
-    const auto in0 = Load(d, &bytes[0 * N]);
-    const auto in1 = Load(d, &bytes[1 * N]);
-
-    // Interleave here, ensure vector results match scalar
-    auto expected = AllocateAligned<T>(3 * N);
-    auto actual_aligned = AllocateAligned<T>(3 * N + 1);
-    T* actual = actual_aligned.get() + 1;
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        expected[2 * i + 0] = bytes[0 * N + i];
-        expected[2 * i + 1] = bytes[1 * N + i];
-        // Ensure we do not write more than 2*N bytes
-        expected[2 * N + i] = actual[2 * N + i] = 0;
-      }
-      StoreInterleaved2(in0, in1, d, actual);
-      size_t pos = 0;
-      if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) {
-        Print(d, "in0", in0, pos / 4);
-        Print(d, "in1", in1, pos / 4);
-        const size_t i = pos;
-        fprintf(stderr, "interleaved i=%d %f %f %f %f  %f %f %f %f\n",
-                static_cast<int>(i), static_cast<double>(actual[i]),
-                static_cast<double>(actual[i + 1]),
-                static_cast<double>(actual[i + 2]),
-                static_cast<double>(actual[i + 3]),
-                static_cast<double>(actual[i + 4]),
-                static_cast<double>(actual[i + 5]),
-                static_cast<double>(actual[i + 6]),
-                static_cast<double>(actual[i + 7]));
-        HWY_ASSERT(false);
-      }
-
-      Vec<D> out0, out1;
-      LoadInterleaved2(d, actual, out0, out1);
-      HWY_ASSERT_VEC_EQ(d, in0, out0);
-      HWY_ASSERT_VEC_EQ(d, in1, out1);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved2() {
-#if HWY_TARGET == HWY_RVV
-  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
-  const ForExtendableVectors<TestLoadStoreInterleaved2, 2> test;
-#else
-  const ForPartialVectors<TestLoadStoreInterleaved2> test;
-#endif
-  ForAllTypes(test);
-}
-
-struct TestLoadStoreInterleaved3 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-
-    RandomState rng;
-
-    // Data to be interleaved
-    auto bytes = AllocateAligned<T>(3 * N);
-    for (size_t i = 0; i < 3 * N; ++i) {
-      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
-    }
-    const auto in0 = Load(d, &bytes[0 * N]);
-    const auto in1 = Load(d, &bytes[1 * N]);
-    const auto in2 = Load(d, &bytes[2 * N]);
-
-    // Interleave here, ensure vector results match scalar
-    auto expected = AllocateAligned<T>(4 * N);
-    auto actual_aligned = AllocateAligned<T>(4 * N + 1);
-    T* actual = actual_aligned.get() + 1;
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        expected[3 * i + 0] = bytes[0 * N + i];
-        expected[3 * i + 1] = bytes[1 * N + i];
-        expected[3 * i + 2] = bytes[2 * N + i];
-        // Ensure we do not write more than 3*N bytes
-        expected[3 * N + i] = actual[3 * N + i] = 0;
-      }
-      StoreInterleaved3(in0, in1, in2, d, actual);
-      size_t pos = 0;
-      if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) {
-        Print(d, "in0", in0, pos / 3, N);
-        Print(d, "in1", in1, pos / 3, N);
-        Print(d, "in2", in2, pos / 3, N);
-        const size_t i = pos;
-        fprintf(stderr, "interleaved i=%d %f %f %f  %f %f %f\n",
-                static_cast<int>(i), static_cast<double>(actual[i]),
-                static_cast<double>(actual[i + 1]),
-                static_cast<double>(actual[i + 2]),
-                static_cast<double>(actual[i + 3]),
-                static_cast<double>(actual[i + 4]),
-                static_cast<double>(actual[i + 5]));
-        HWY_ASSERT(false);
-      }
-
-      Vec<D> out0, out1, out2;
-      LoadInterleaved3(d, actual, out0, out1, out2);
-      HWY_ASSERT_VEC_EQ(d, in0, out0);
-      HWY_ASSERT_VEC_EQ(d, in1, out1);
-      HWY_ASSERT_VEC_EQ(d, in2, out2);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved3() {
-#if HWY_TARGET == HWY_RVV
-  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
-  const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test;
-#else
-  const ForPartialVectors<TestLoadStoreInterleaved3> test;
-#endif
-  ForAllTypes(test);
-}
-
-struct TestLoadStoreInterleaved4 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-
-    RandomState rng;
-
-    // Data to be interleaved
-    auto bytes = AllocateAligned<T>(4 * N);
-
-    for (size_t i = 0; i < 4 * N; ++i) {
-      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
-    }
-    const auto in0 = Load(d, &bytes[0 * N]);
-    const auto in1 = Load(d, &bytes[1 * N]);
-    const auto in2 = Load(d, &bytes[2 * N]);
-    const auto in3 = Load(d, &bytes[3 * N]);
-
-    // Interleave here, ensure vector results match scalar
-    auto expected = AllocateAligned<T>(5 * N);
-    auto actual_aligned = AllocateAligned<T>(5 * N + 1);
-    T* actual = actual_aligned.get() + 1;
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        expected[4 * i + 0] = bytes[0 * N + i];
-        expected[4 * i + 1] = bytes[1 * N + i];
-        expected[4 * i + 2] = bytes[2 * N + i];
-        expected[4 * i + 3] = bytes[3 * N + i];
-        // Ensure we do not write more than 4*N bytes
-        expected[4 * N + i] = actual[4 * N + i] = 0;
-      }
-      StoreInterleaved4(in0, in1, in2, in3, d, actual);
-      size_t pos = 0;
-      if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) {
-        Print(d, "in0", in0, pos / 4);
-        Print(d, "in1", in1, pos / 4);
-        Print(d, "in2", in2, pos / 4);
-        Print(d, "in3", in3, pos / 4);
-        const size_t i = pos;
-        fprintf(stderr, "interleaved i=%d %f %f %f %f  %f %f %f %f\n",
-                static_cast<int>(i), static_cast<double>(actual[i]),
-                static_cast<double>(actual[i + 1]),
-                static_cast<double>(actual[i + 2]),
-                static_cast<double>(actual[i + 3]),
-                static_cast<double>(actual[i + 4]),
-                static_cast<double>(actual[i + 5]),
-                static_cast<double>(actual[i + 6]),
-                static_cast<double>(actual[i + 7]));
-        HWY_ASSERT(false);
-      }
-
-      Vec<D> out0, out1, out2, out3;
-      LoadInterleaved4(d, actual, out0, out1, out2, out3);
-      HWY_ASSERT_VEC_EQ(d, in0, out0);
-      HWY_ASSERT_VEC_EQ(d, in1, out1);
-      HWY_ASSERT_VEC_EQ(d, in2, out2);
-      HWY_ASSERT_VEC_EQ(d, in3, out3);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved4() {
-#if HWY_TARGET == HWY_RVV
-  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
-  const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test;
-#else
-  const ForPartialVectors<TestLoadStoreInterleaved4> test;
-#endif
-  ForAllTypes(test);
-}
-
 struct TestLoadDup128 {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -531,9 +331,6 @@ namespace hwy {
 HWY_BEFORE_TEST(HwyMemoryTest);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStoreInterleaved2);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStoreInterleaved3);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStoreInterleaved4);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
diff -pruN 0.17.0-11/hwy/tests/mul_test.cc 1.0.0-2/hwy/tests/mul_test.cc
--- 0.17.0-11/hwy/tests/mul_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/mul_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -19,7 +19,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/mul_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -59,7 +59,7 @@ struct TestUnsignedMul {
 
     const size_t bits = sizeof(T) * 8;
     const uint64_t mask = (1ull << bits) - 1;
-    const T max2 = (uint64_t(max) * max) & mask;
+    const T max2 = (static_cast<uint64_t>(max) * max) & mask;
     HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
   }
 };
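
The added cast silences an implicit-conversion warning; the value it computes is the wrap-around product max*max modulo 2^bits, which Mul(vmax, vmax) must reproduce per lane. A quick standalone check of that arithmetic (not part of the test itself):

#include <stdint.h>

// 255 * 255 = 65025 = 0xFE01, so the low byte is 1; likewise for 16 bits,
// (2^16 - 1)^2 = 2^32 - 2^17 + 1, whose low 16 bits are also 1.
static_assert((static_cast<uint64_t>(255) * 255 & 0xFF) == 1, "u8 wrap");
static_assert((static_cast<uint64_t>(65535) * 65535 & 0xFFFF) == 1, "u16 wrap");
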
diff -pruN 0.17.0-11/hwy/tests/reduction_test.cc 1.0.0-2/hwy/tests/reduction_test.cc
--- 0.17.0-11/hwy/tests/reduction_test.cc	1970-01-01 00:00:00.000000000 +0000
+++ 1.0.0-2/hwy/tests/reduction_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -0,0 +1,171 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/reduction_test.cc"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestSumOfLanes {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+
+    // Lane i = bit i, higher lanes 0
+    double sum = 0.0;
+    // Avoid setting sign bit and cap at double precision
+    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
+      sum += static_cast<double>(in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
+                      SumOfLanes(d, Load(d, in_lanes.get())));
+
+    // Lane i = i (iota) to include upper lanes
+    sum = 0.0;
+    for (size_t i = 0; i < N; ++i) {
+      sum += static_cast<double>(i);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
+  }
+};
+
+HWY_NOINLINE void TestAllSumOfLanes() {
+  ForUIF3264(ForPartialVectors<TestSumOfLanes>());
+}
+
+struct TestMinOfLanes {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+
+    // Lane i = bit i, higher lanes = 2 (not the minimum)
+    T min = HighestValue<T>();
+    // Avoid setting sign bit and cap at double precision
+    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
+      min = HWY_MIN(min, in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+
+    // Lane i = N - i to include upper lanes
+    min = HighestValue<T>();
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = static_cast<T>(N - i);  // no 8-bit T so no wraparound
+      min = HWY_MIN(min, in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+  }
+};
+
+struct TestMaxOfLanes {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+
+    T max = LowestValue<T>();
+    // Avoid setting sign bit and cap at double precision
+    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
+      max = HWY_MAX(max, in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+
+    // Lane i = i to include upper lanes
+    max = LowestValue<T>();
+    for (size_t i = 0; i < N; ++i) {
+      in_lanes[i] = static_cast<T>(i);  // no 8-bit T so no wraparound
+      max = HWY_MAX(max, in_lanes[i]);
+    }
+    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+  }
+};
+
+HWY_NOINLINE void TestAllMinMaxOfLanes() {
+  const ForPartialVectors<TestMinOfLanes> test_min;
+  const ForPartialVectors<TestMaxOfLanes> test_max;
+  ForUIF3264(test_min);
+  ForUIF3264(test_max);
+  test_min(uint16_t());
+  test_max(uint16_t());
+  test_min(int16_t());
+  test_max(int16_t());
+}
+
+struct TestSumsOf8 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    const size_t N = Lanes(d);
+    if (N < 8) return;
+    const Repartition<uint64_t, D> du64;
+
+    auto in_lanes = AllocateAligned<T>(N);
+    auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in_lanes[i] = Random64(&rng) & 0xFF;
+      }
+
+      for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
+        uint64_t sum = 0;
+        for (size_t i = 0; i < 8; ++i) {
+          sum += in_lanes[idx_sum * 8 + i];
+        }
+        sum_lanes[idx_sum] = sum;
+      }
+
+      const Vec<D> in = Load(d, in_lanes.get());
+      HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllSumsOf8() {
+  ForGEVectors<64, TestSumsOf8>()(uint8_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyReductionTest);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8);
+}  // namespace hwy
+
+#endif
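
The new reduction_test.cc covers the horizontal reductions (SumOfLanes, Min/MaxOfLanes, SumsOf8). A minimal sketch of the common use of SumOfLanes, reducing a per-lane accumulator to a scalar after a loop; the SumArray helper and size assumption are illustrative:

#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Horizontal sum of an array: accumulate per lane, reduce once at the end.
// Assumes `size` is a multiple of the vector length.
float SumArray(const float* HWY_RESTRICT data, size_t size) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  auto acc = hn::Zero(d);
  for (size_t i = 0; i < size; i += N) {
    acc = hn::Add(acc, hn::LoadU(d, data + i));
  }
  // SumOfLanes broadcasts the total to every lane; GetLane reads lane 0.
  return hn::GetLane(hn::SumOfLanes(d, acc));
}
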
diff -pruN 0.17.0-11/hwy/tests/reverse_test.cc 1.0.0-2/hwy/tests/reverse_test.cc
--- 0.17.0-11/hwy/tests/reverse_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/reverse_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -19,7 +19,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/reverse_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
diff -pruN 0.17.0-11/hwy/tests/shift_test.cc 1.0.0-2/hwy/tests/shift_test.cc
--- 0.17.0-11/hwy/tests/shift_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/shift_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -22,7 +22,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/shift_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -44,7 +44,8 @@ struct TestLeftShifts {
     const size_t N = Lanes(d);
     auto expected = AllocateAligned<T>(N);
 
-    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
+    // Values to shift
+    const auto values = Iota(d, static_cast<T>(kSigned ? -TI(N) : TI(0)));
     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
 
     // 0
diff -pruN 0.17.0-11/hwy/tests/swizzle_test.cc 1.0.0-2/hwy/tests/swizzle_test.cc
--- 0.17.0-11/hwy/tests/swizzle_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/swizzle_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -20,7 +20,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
diff -pruN 0.17.0-11/hwy/tests/test_util_test.cc 1.0.0-2/hwy/tests/test_util_test.cc
--- 0.17.0-11/hwy/tests/test_util_test.cc	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/hwy/tests/test_util_test.cc	2022-07-27 11:48:16.000000000 +0000
@@ -18,7 +18,7 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
 
diff -pruN 0.17.0-11/README.md 1.0.0-2/README.md
--- 0.17.0-11/README.md	2022-06-02 15:30:41.000000000 +0000
+++ 1.0.0-2/README.md	2022-07-27 11:48:16.000000000 +0000
@@ -83,19 +83,19 @@ incrementing MINOR after backward-compat
 backward-compatible fixes. We recommend using releases (rather than the Git tip)
 because they are tested more extensively, see below.
 
-Version 0.11 is considered stable enough to use in other projects.
-Version 1.0 will signal an increased focus on backwards compatibility and is
-planned for 2022H1 now that all targets are feature-complete.
+The current version 1.0 signals an increased focus on backwards compatibility.
+Applications using documented functionality will remain compatible with future
+updates that have the same major version number.
 
 ### Testing
 
 Continuous integration tests build with a recent version of Clang (running on
-native x86, Spike for RVV, and QEMU for ARM) and MSVC from VS2015 (running on
-native x86).
+native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
+x86).
 
-Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
-GCC cross-compile and QEMU. See the
-[testing process](g3doc/release_testing_process.md) for details.
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
+cross-compile. See the [testing process](g3doc/release_testing_process.md) for
+details.
 
 ### Related modules
 
